diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000000000..6bbd46d0ff9565 --- /dev/null +++ b/.clang-format @@ -0,0 +1,29 @@ +# This file is used by clang-format to autoformat paddle source code +# +# clang-format is part of the llvm toolchain. +# You need to install llvm and clang to format source code style. +# +# The basic usage is, +# clang-format -i -style=file PATH/TO/SOURCE/CODE +# +# The -style=file option implicitly uses the ".clang-format" file located in +# one of the parent directories. +# The -i flag means in-place change. +# +# The documentation of clang-format is at +# http://clang.llvm.org/docs/ClangFormat.html +# http://clang.llvm.org/docs/ClangFormatStyleOptions.html +# +# TODO(yuyang18): Add python and other language code style +--- +Language: Cpp +BasedOnStyle: Google +IndentWidth: 2 +TabWidth: 2 +ContinuationIndentWidth: 4 +AccessModifierOffset: -2 # The private/protected/public has no indent in class +PointerAlignment: Left # int* p/int& p, not int *p/int &p +Standard: Cpp11 +AllowAllParametersOfDeclarationOnNextLine: true +...
+ diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000000000..00368ede67d3d2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.DS_Store +build/ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000000000..cb991cc9cfccf5 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,156 @@ +cmake_minimum_required(VERSION 2.8) + +project(paddle CXX C) +set(PADDLE_MAJOR_VERSION 0) +set(PADDLE_MINOR_VERSION 8) +set(PADDLE_PATCH_VERSION 0b) +set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION}) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") +set(PROJ_ROOT ${CMAKE_SOURCE_DIR}) +include(package) +include(swig) +find_package(CUDA QUIET) +find_package(Protobuf REQUIRED) +find_package(PythonLibs 2.7 REQUIRED) +find_package(PythonInterp 2.7 REQUIRED) +find_package(NumPy) +find_package(Threads REQUIRED) +find_package(Glog) +find_package(Gflags QUIET) +find_package(GTest) +find_package(Sphinx) +find_package(Doxygen) +include(cblas) +find_program(M4_EXECUTABLE m4) +###################### Configurations ########################### +option(WITH_DSO "Compile PaddlePaddle with dynamic linked libraries" ON) +option(WITH_GPU "Compile PaddlePaddle with gpu" ${CUDA_FOUND}) +option(WITH_DOUBLE "Compile PaddlePaddle with double precision, otherwise use single precision" OFF) +option(WITH_AVX "Compile PaddlePaddle with avx instructs" ON) # TODO(yuyang18): Check AVX is supported or not as default value +option(WITH_PYTHON "Compile PaddlePaddle with python interpretor" ON) +option(WITH_STYLE_CHECK "Style Check for PaddlePaddle" ${PYTHONINTERP_FOUND}) +option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF) +option(WITH_GLOG "Compile PaddlePaddle use glog, otherwise use a log implement internally" ${LIBGLOG_FOUND}) +option(WITH_GFLAGS "Compile PaddlePaddle use gflags, otherwise use a flag implement internally" ${GFLAGS_FOUND}) +option(WITH_TIMER "Compile PaddlePaddle use timer" OFF) 
+option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND}) +option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) +option(WITH_DOC_CN "Compile PaddlePaddle with Chinese documentation" OFF) +option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle predict api" ${SWIG_FOUND}) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +include(enableCXX11) +include(cpplint) +include(ccache) +include(util) +include(flags) +include(cudnn) +include(FindPythonModule) +include(check_packages) + +# add PaddlePaddle version +if(DEFINED ENV{PADDLE_VERSION}) + add_definitions(-DPADDLE_VERSION=\"$ENV{PADDLE_VERSION}\") +else() + if(EXISTS ${PROJ_ROOT}/.svn/) + find_package(Subversion REQUIRED) + if(SUBVERSION_FOUND) + Subversion_WC_INFO(${PROJ_ROOT} Project) + add_definitions(-DPADDLE_VERSION=${Project_WC_REVISION}) + endif() + endif() +endif() + + +if(NOT WITH_GPU) + add_definitions(-DPADDLE_ONLY_CPU) + add_definitions(-DHPPL_STUB_FUNC) + list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) +else() + # TODO(yuyang18): Change it to remove std=c++11 in cuda compile. 
+ set(CUDA_PROPAGATE_HOST_FLAGS OFF) + if(NOT CUDNN_FOUND) + message(FATAL_ERROR "Paddle need cudnn to compile") + endif() + + if(WITH_DSO) + set(CUDA_LIBRARIES "") + add_definitions(-DPADDLE_USE_DSO) + endif(WITH_DSO) + + # Include cuda and cudnn + include_directories(${CUDNN_INCLUDE_DIR}) + include_directories(${CUDA_TOOLKIT_INCLUDE}) +endif(NOT WITH_GPU) + +if(WITH_DOUBLE) + add_definitions(-DPADDLE_TYPE_DOUBLE -DHPPL_TYPE_DOUBLE) + set(ACCURACY double) +else(WITH_DOUBLE) + set(ACCURACY float) +endif(WITH_DOUBLE) + +if(NOT WITH_TIMER) + add_definitions(-DPADDLE_DISABLE_TIMER) +endif(NOT WITH_TIMER) + +if(WITH_AVX) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx") +else(WITH_AVX) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3") +endif(WITH_AVX) + +if(WITH_PYTHON) + include_directories(${PYTHON_INCLUDE_DIR}) + include_directories(${PYTHON_NUMPY_INCLUDE_DIR}) +else(WITH_PYTHON) + add_definitions(-DPADDLE_NO_PYTHON) +endif(WITH_PYTHON) + +if(NOT WITH_RDMA) + add_definitions(-DPADDLE_DISABLE_RDMA) +endif() + +if(WITH_GLOG) + add_definitions(-DPADDLE_USE_GLOG) +endif() + +if(WITH_GFLAGS) + add_definitions(-DPADDLE_USE_GFLAGS) + add_definitions(-DGFLAGS_NS=${GFLAGS_NAMESPACE}) + include_directories(${GFLAGS_INCLUDE_DIRS}) +endif() + +if(WITH_TESTING) + enable_testing() + include_directories(${GTEST_INCLUDE_DIRS}) +endif() + +include_directories("${CBLAS_INC_DIR}") +include_directories("${PROJ_ROOT}") +include_directories("${PROJ_ROOT}/paddle/cuda/include") +include_directories(${PROTOBUF_INCLUDE_DIRS}) +include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") +if(EXISTS "${PROJ_ROOT}/paddle/internals/CMakeLists.txt") + set(PADDLE_WITH_INTERNAL ON) + include(paddle/internals/CMakeLists.txt) +else() + set(PADDLE_WITH_INTERNAL OFF) + set(INTERNAL_PROTO_PATH "") +endif() +add_subdirectory(proto) +add_subdirectory(paddle) +add_subdirectory(python) +if(WITH_DOC) + 
add_subdirectory(doc) +endif() +if(WITH_DOC_CN) + add_subdirectory(doc_cn) +endif() diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000000000..2ff3140db0d702 --- /dev/null +++ b/LICENSE @@ -0,0 +1,203 @@ +Copyright (c) 2016 Baidu, Inc. All Rights Reserved + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 00000000000000..e7d337866c4ba9 --- /dev/null +++ b/README.md @@ -0,0 +1,84 @@ +# PaddlePaddle + +[![Documentation Status](https://readthedocs.org/projects/ctcspeechrecognition/badge/?version=latest)](http://ctcspeechrecognition.readthedocs.io/en/latest/?badge=latest) + +PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use, +efficient, flexible and scalable deep learning platform, which is originally +developed by Baidu scientists and engineers for the purpose of applying deep +learning to many products at Baidu. + +## Features + +- **Flexibility** + + PaddlePaddle supports a wide range of neural network architectures and + optimization algorithms. 
It is easy to configure complex models such as + neural machine translation model with attention mechanism or complex memory + connection. + +- **Efficiency** + + In order to unleash the power of heterogeneous computing resource, + optimization occurs at different levels of PaddlePaddle, including + computing, memory, architecture and communication. The following are some + examples: + 1. Optimized math operations through SSE/AVX intrinsics, BLAS libraries + (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels. + 2. Highly optimized recurrent networks which can handle **variable-length** + sequence without padding. + 3. Optimized local and distributed training for models with high dimensional + sparse data. + +- **Scalability** + + With PaddlePaddle, it is easy to use many CPUs/GPUs and machines to speed + up your training. PaddlePaddle can achieve high throughput and performance + via optimized communication. + +- **Connected to Products** + + In addition, PaddlePaddle is also designed to be easily deployable. At Baidu, + PaddlePaddle has been deployed into products or service with a vast number + of users, including ad click-through rate (CTR) prediction, large-scale image + classification, optical character recognition(OCR), search ranking, computer + virus detection, recommendation, etc. It is widely utilized in products at + Baidu and it has achieved a significant impact. We hope you can also exploit + the capability of PaddlePaddle to make a huge impact for your product. + +## Installation +See [installation guide]() to build and install from the source code or install +the Docker Image. + +## Documentation +- [Quick Start]()
+ You can follow the quick start tutorial to learn how to use PaddlePaddle + step-by-step. + +- [Example and Demo]()
+ We provide five demos, including: image classification, sentiment analysis, + sequence to sequence model, recommendation, semantic role labelling. + +- [Distributed Training]()
+ This system supports training deep learning models on multiple machines + with data parallelism. + +- [Python API]()
+ PaddlePaddle supports using either the Python interface or C++ to build your + system. We also use SWIG to wrap C++ source code to create a user-friendly + interface for Python. You can also use SWIG to create an interface for your + favorite programming language. + +- [How to Contribute]()
+ We sincerely appreciate your interest and contributions. If you’d like to + contribute, please read the contribution guide. + +- [Source Code Documents]()
+ +## Ask Questions + +If you want to ask questions and discuss about methods and models, welcome +to send email to paddle-dev@baidu.com. Framework development discussions and +bug reports are collected on [Issues](https://github.com/paddle/paddle/issues). + +## Copyright and License +PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). diff --git a/authors b/authors new file mode 100644 index 00000000000000..ab4d3118ff1f7e --- /dev/null +++ b/authors @@ -0,0 +1,53 @@ +Cao, Ying +Cheng, Yujuan +Dang, Qingqing +Dong, Tengfei +Du, Dalong +Feng, Shouqiang +Gao, Haoyuan +Han, Baochang +Han, Jinchen +Hao, Nanyu +He, Daoyuan +He, Zhengyan +Hou, Jue +Huang, Chang +Huang, Zhiheng +Hu, Na +Kong, Qi +Liao, Gang +Li, Bo +Li, Jiajie +Li, Jing +Li, Lei +Li, Peng +Liu, Sheng +Liu, Yuan +Li, Yuze +Luo, Heng +Luo, Tao +Lyu, Qin +Mao, Hongyue +Qian, Xiaojun +Qi, Jun +Qin, Duohao +Shen, Guolong +Shi, Guangchuan +Song, Xiang +Wang, Jiang +Wang, Yanfei +Wang, Yong +Weng, Renliang +Xu, Tianbing +Xu, Wei +Xu, Xingyu +Yan, Chong +Yan, Chunwei +Yang, Yi +Yu, Yang +Yu, Yinan +Zhang, Jian +Zhang, Ruiqing +Zhang, Weide +Zhao, Liang +Zhou, Jie diff --git a/cmake/FindGflags.cmake b/cmake/FindGflags.cmake new file mode 100644 index 00000000000000..6587089ba382dc --- /dev/null +++ b/cmake/FindGflags.cmake @@ -0,0 +1,582 @@ +# Ceres Solver - A fast non-linear least squares minimizer +# Copyright 2015 Google Inc. All rights reserved. +# http://ceres-solver.org/ +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# * Neither the name of Google Inc. nor the names of its contributors may be +# used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: alexs.mac@gmail.com (Alex Stewart) +# + +# FindGflags.cmake - Find Google gflags logging library. +# +# This module will attempt to find gflags, either via an exported CMake +# configuration (generated by gflags >= 2.1 which are built with CMake), or +# by performing a standard search for all gflags components. The order of +# precedence for these two methods of finding gflags is controlled by: +# GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION. +# +# This module defines the following variables: +# +# GFLAGS_FOUND: TRUE iff gflags is found. +# GFLAGS_INCLUDE_DIRS: Include directories for gflags. +# GFLAGS_LIBRARIES: Libraries required to link gflags. +# GFLAGS_NAMESPACE: The namespace in which gflags is defined. In versions of +# gflags < 2.1, this was google, for versions >= 2.1 it is +# by default gflags, although can be configured when building +# gflags to be something else (i.e. google for legacy +# compatibility). 
+# +# The following variables control the behaviour of this module when an exported +# gflags CMake configuration is not found. +# +# GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION: TRUE/FALSE, iff TRUE then +# then prefer using an exported CMake configuration +# generated by gflags >= 2.1 over searching for the +# gflags components manually. Otherwise (FALSE) +# ignore any exported gflags CMake configurations and +# always perform a manual search for the components. +# Default: TRUE iff user does not define this variable +# before we are called, and does NOT specify either +# GFLAGS_INCLUDE_DIR_HINTS or GFLAGS_LIBRARY_DIR_HINTS +# otherwise FALSE. +# GFLAGS_INCLUDE_DIR_HINTS: List of additional directories in which to +# search for gflags includes, e.g: /timbuktu/include. +# GFLAGS_LIBRARY_DIR_HINTS: List of additional directories in which to +# search for gflags libraries, e.g: /timbuktu/lib. +# +# The following variables are also defined by this module, but in line with +# CMake recommended FindPackage() module style should NOT be referenced directly +# by callers (use the plural variables detailed above instead). These variables +# do however affect the behaviour of the module via FIND_[PATH/LIBRARY]() which +# are NOT re-called (i.e. search for library is not repeated) if these variables +# are set with valid values _in the CMake cache_. This means that if these +# variables are set directly in the cache, either by the user in the CMake GUI, +# or by the user passing -DVAR=VALUE directives to CMake when called (which +# explicitly defines a cache variable), then they will be used verbatim, +# bypassing the HINTS variables and other hard-coded search locations. +# +# GFLAGS_INCLUDE_DIR: Include directory for gflags, not including the +# include directory of any dependencies. +# GFLAGS_LIBRARY: gflags library, not including the libraries of any +# dependencies. 
+ +# Reset CALLERS_CMAKE_FIND_LIBRARY_PREFIXES to its value when FindGflags was +# invoked, necessary for MSVC. +macro(GFLAGS_RESET_FIND_LIBRARY_PREFIX) + if (MSVC) + set(CMAKE_FIND_LIBRARY_PREFIXES "${CALLERS_CMAKE_FIND_LIBRARY_PREFIXES}") + endif (MSVC) +endmacro(GFLAGS_RESET_FIND_LIBRARY_PREFIX) + +# Called if we failed to find gflags or any of it's required dependencies, +# unsets all public (designed to be used externally) variables and reports +# error message at priority depending upon [REQUIRED/QUIET/] argument. +macro(GFLAGS_REPORT_NOT_FOUND REASON_MSG) + unset(GFLAGS_FOUND) + unset(GFLAGS_INCLUDE_DIRS) + unset(GFLAGS_LIBRARIES) + # Do not use unset, as we want to keep GFLAGS_NAMESPACE in the cache, + # but simply clear its value. + set(GFLAGS_NAMESPACE "" CACHE STRING + "gflags namespace (google or gflags)" FORCE) + + # Make results of search visible in the CMake GUI if gflags has not + # been found so that user does not have to toggle to advanced view. + mark_as_advanced(CLEAR GFLAGS_INCLUDE_DIR + GFLAGS_LIBRARY + GFLAGS_NAMESPACE) + + gflags_reset_find_library_prefix() + + # Note _FIND_[REQUIRED/QUIETLY] variables defined by FindPackage() + # use the camelcase library name, not uppercase. + if (Gflags_FIND_QUIETLY) + message(STATUS "Failed to find gflags - " ${REASON_MSG} ${ARGN}) + elseif (Gflags_FIND_REQUIRED) + message(FATAL_ERROR "Failed to find gflags - " ${REASON_MSG} ${ARGN}) + else() + # Neither QUIETLY nor REQUIRED, use no priority which emits a message + # but continues configuration and allows generation. + message("-- Failed to find gflags - " ${REASON_MSG} ${ARGN}) + endif () + return() +endmacro(GFLAGS_REPORT_NOT_FOUND) + +# Verify that all variable names passed as arguments are defined (can be empty +# but must be defined) or raise a fatal error. 
+macro(GFLAGS_CHECK_VARS_DEFINED) + foreach(CHECK_VAR ${ARGN}) + if (NOT DEFINED ${CHECK_VAR}) + message(FATAL_ERROR "Ceres Bug: ${CHECK_VAR} is not defined.") + endif() + endforeach() +endmacro(GFLAGS_CHECK_VARS_DEFINED) + +# Use check_cxx_source_compiles() to compile trivial test programs to determine +# the gflags namespace. This works on all OSs except Windows. If using Visual +# Studio, it fails because msbuild forces check_cxx_source_compiles() to use +# CMAKE_BUILD_TYPE=Debug for the test project, which usually breaks detection +# because MSVC requires that the test project use the same build type as gflags, +# which would normally be built in Release. +# +# Defines: GFLAGS_NAMESPACE in the caller's scope with the detected namespace, +# which is blank (empty string, will test FALSE is CMake conditionals) +# if detection failed. +function(GFLAGS_CHECK_GFLAGS_NAMESPACE_USING_TRY_COMPILE) + # Verify that all required variables are defined. + gflags_check_vars_defined( + GFLAGS_INCLUDE_DIR GFLAGS_LIBRARY) + # Ensure that GFLAGS_NAMESPACE is always unset on completion unless + # we explicitly set if after having the correct namespace. + set(GFLAGS_NAMESPACE "" PARENT_SCOPE) + + include(CheckCXXSourceCompiles) + # Setup include path & link library for gflags for CHECK_CXX_SOURCE_COMPILES. + set(CMAKE_REQUIRED_INCLUDES ${GFLAGS_INCLUDE_DIR}) + set(CMAKE_REQUIRED_LIBRARIES ${GFLAGS_LIBRARY} ${GFLAGS_LINK_LIBRARIES}) + # First try the (older) google namespace. Note that the output variable + # MUST be unique to the build type as otherwise the test is not repeated as + # it is assumed to have already been performed. + check_cxx_source_compiles( + "#include + int main(int argc, char * argv[]) { + google::ParseCommandLineFlags(&argc, &argv, true); + return 0; + }" + GFLAGS_IN_GOOGLE_NAMESPACE) + if (GFLAGS_IN_GOOGLE_NAMESPACE) + set(GFLAGS_NAMESPACE google PARENT_SCOPE) + return() + endif() + + # Try (newer) gflags namespace instead. 
# Note that the output variable MUST be unique to the build type as
# otherwise the test is not repeated as it is assumed to have already
# been performed.
  set(CMAKE_REQUIRED_INCLUDES ${GFLAGS_INCLUDE_DIR})
  set(CMAKE_REQUIRED_LIBRARIES ${GFLAGS_LIBRARY} ${GFLAGS_LINK_LIBRARIES})
  # NOTE(review): the <gflags/gflags.h> header name was stripped by text
  # extraction; restored here from the canonical form of this module.
  check_cxx_source_compiles(
    "#include <gflags/gflags.h>
     int main(int argc, char * argv[]) {
       gflags::ParseCommandLineFlags(&argc, &argv, true);
       return 0;
     }"
    GFLAGS_IN_GFLAGS_NAMESPACE)
  if (GFLAGS_IN_GFLAGS_NAMESPACE)
    set(GFLAGS_NAMESPACE gflags PARENT_SCOPE)
    return()
  endif()
endfunction(GFLAGS_CHECK_GFLAGS_NAMESPACE_USING_TRY_COMPILE)

# Use regex on the gflags headers to attempt to determine the gflags namespace.
# Checks both gflags.h (contained namespace on versions < 2.1.2) and
# gflags_declare.h, which contains the namespace on versions >= 2.1.2.
# In general, this method should only be used when
# GFLAGS_CHECK_GFLAGS_NAMESPACE_USING_TRY_COMPILE() cannot be used, or has
# failed.
#
# Defines: GFLAGS_NAMESPACE in the caller's scope with the detected namespace,
# which is blank (empty string, will test FALSE in CMake conditionals)
# if detection failed.
function(GFLAGS_CHECK_GFLAGS_NAMESPACE_USING_REGEX)
  # Verify that all required variables are defined.
  gflags_check_vars_defined(GFLAGS_INCLUDE_DIR)
  # Ensure that GFLAGS_NAMESPACE is always undefined on completion unless
  # we explicitly set it after having found the correct namespace.
  set(GFLAGS_NAMESPACE "" PARENT_SCOPE)

  # Scan gflags.h to identify what namespace gflags was built with.  On
  # versions of gflags < 2.1.2, gflags.h was configured with the namespace
  # directly; on >= 2.1.2 gflags.h uses the GFLAGS_NAMESPACE #define which
  # is defined in gflags_declare.h.  We try each location in turn.
  set(GFLAGS_HEADER_FILE ${GFLAGS_INCLUDE_DIR}/gflags/gflags.h)
  if (NOT EXISTS ${GFLAGS_HEADER_FILE})
    gflags_report_not_found(
      "Could not find file: ${GFLAGS_HEADER_FILE} "
      "containing namespace information in gflags install located at: "
      "${GFLAGS_INCLUDE_DIR}.")
  endif()
  file(READ ${GFLAGS_HEADER_FILE} GFLAGS_HEADER_FILE_CONTENTS)

  string(REGEX MATCH "namespace [A-Za-z]+"
    GFLAGS_NAMESPACE "${GFLAGS_HEADER_FILE_CONTENTS}")
  string(REGEX REPLACE "namespace ([A-Za-z]+)" "\\1"
    GFLAGS_NAMESPACE "${GFLAGS_NAMESPACE}")

  if (NOT GFLAGS_NAMESPACE)
    gflags_report_not_found(
      "Failed to extract gflags namespace from header file: "
      "${GFLAGS_HEADER_FILE}.")
  endif()

  if (GFLAGS_NAMESPACE STREQUAL "google" OR
      GFLAGS_NAMESPACE STREQUAL "gflags")
    # Found valid gflags namespace from gflags.h.
    set(GFLAGS_NAMESPACE "${GFLAGS_NAMESPACE}" PARENT_SCOPE)
    return()
  endif()

  # Failed to find gflags namespace from gflags.h; gflags is likely a newer
  # version, so check gflags_declare.h, which in versions >= 2.1.2 contains
  # the GFLAGS_NAMESPACE #define that gflags.h references.
  set(GFLAGS_DECLARE_FILE ${GFLAGS_INCLUDE_DIR}/gflags/gflags_declare.h)
  if (NOT EXISTS ${GFLAGS_DECLARE_FILE})
    gflags_report_not_found(
      "Could not find file: ${GFLAGS_DECLARE_FILE} "
      "containing namespace information in gflags install located at: "
      "${GFLAGS_INCLUDE_DIR}.")
  endif()
  file(READ ${GFLAGS_DECLARE_FILE} GFLAGS_DECLARE_FILE_CONTENTS)

  string(REGEX MATCH "#define GFLAGS_NAMESPACE [A-Za-z]+"
    GFLAGS_NAMESPACE "${GFLAGS_DECLARE_FILE_CONTENTS}")
  string(REGEX REPLACE "#define GFLAGS_NAMESPACE ([A-Za-z]+)" "\\1"
    GFLAGS_NAMESPACE "${GFLAGS_NAMESPACE}")

  if (NOT GFLAGS_NAMESPACE)
    gflags_report_not_found(
      "Failed to extract gflags namespace from declare file: "
      "${GFLAGS_DECLARE_FILE}.")
  endif()

  if (GFLAGS_NAMESPACE STREQUAL "google" OR
      GFLAGS_NAMESPACE STREQUAL "gflags")
    # Found valid gflags namespace from gflags_declare.h.
    set(GFLAGS_NAMESPACE "${GFLAGS_NAMESPACE}" PARENT_SCOPE)
    return()
  endif()
endfunction(GFLAGS_CHECK_GFLAGS_NAMESPACE_USING_REGEX)

# -----------------------------------------------------------------
# By default, if the user has expressed no preference for using an exported
# gflags CMake configuration over performing a search for the installed
# components, and has not specified any hints for the search locations, then
# prefer a gflags exported configuration if available.
if (NOT DEFINED GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION
    AND NOT GFLAGS_INCLUDE_DIR_HINTS
    AND NOT GFLAGS_LIBRARY_DIR_HINTS)
  message(STATUS "No preference for use of exported gflags CMake configuration "
    "set, and no hints for include/library directories provided. "
    "Defaulting to preferring an installed/exported gflags CMake configuration "
    "if available.")
  set(GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION TRUE)
endif()

if (GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION)
  # Try to find an exported CMake configuration for gflags, as generated by
  # gflags versions >= 2.1.
+ # + # We search twice, s/t we can invert the ordering of precedence used by + # find_package() for exported package build directories, and installed + # packages (found via CMAKE_SYSTEM_PREFIX_PATH), listed as items 6) and 7) + # respectively in [1]. + # + # By default, exported build directories are (in theory) detected first, and + # this is usually the case on Windows. However, on OS X & Linux, the install + # path (/usr/local) is typically present in the PATH environment variable + # which is checked in item 4) in [1] (i.e. before both of the above, unless + # NO_SYSTEM_ENVIRONMENT_PATH is passed). As such on those OSs installed + # packages are usually detected in preference to exported package build + # directories. + # + # To ensure a more consistent response across all OSs, and as users usually + # want to prefer an installed version of a package over a locally built one + # where both exist (esp. as the exported build directory might be removed + # after installation), we first search with NO_CMAKE_PACKAGE_REGISTRY which + # means any build directories exported by the user are ignored, and thus + # installed directories are preferred. If this fails to find the package + # we then research again, but without NO_CMAKE_PACKAGE_REGISTRY, so any + # exported build directories will now be detected. + # + # To prevent confusion on Windows, we also pass NO_CMAKE_BUILDS_PATH (which + # is item 5) in [1]), to not preferentially use projects that were built + # recently with the CMake GUI to ensure that we always prefer an installed + # version if available. + # + # [1] http://www.cmake.org/cmake/help/v2.8.11/cmake.html#command:find_package + find_package(gflags QUIET + NO_MODULE + NO_CMAKE_PACKAGE_REGISTRY + NO_CMAKE_BUILDS_PATH) + if (gflags_FOUND) + message(STATUS "Found installed version of gflags: ${gflags_DIR}") + else(gflags_FOUND) + # Failed to find an installed version of gflags, repeat search allowing + # exported build directories. 
+ message(STATUS "Failed to find installed gflags CMake configuration, " + "searching for gflags build directories exported with CMake.") + # Again pass NO_CMAKE_BUILDS_PATH, as we know that gflags is exported and + # do not want to treat projects built with the CMake GUI preferentially. + find_package(gflags QUIET + NO_MODULE + NO_CMAKE_BUILDS_PATH) + if (gflags_FOUND) + message(STATUS "Found exported gflags build directory: ${gflags_DIR}") + endif(gflags_FOUND) + endif(gflags_FOUND) + + set(FOUND_INSTALLED_GFLAGS_CMAKE_CONFIGURATION ${gflags_FOUND}) + + # gflags v2.1 - 2.1.2 shipped with a bug in their gflags-config.cmake [1] + # whereby gflags_LIBRARIES = "gflags", but there was no imported target + # called "gflags", they were called: gflags[_nothreads]-[static/shared]. + # As this causes linker errors when gflags is not installed in a location + # on the current library paths, detect if this problem is present and + # fix it. + # + # [1] https://github.com/gflags/gflags/issues/110 + if (gflags_FOUND) + # NOTE: This is not written as additional conditions in the outer + # if (gflags_FOUND) as the NOT TARGET "${gflags_LIBRARIES}" + # condition causes problems if gflags is not found. + if (${gflags_VERSION} VERSION_LESS 2.1.3 AND + NOT TARGET "${gflags_LIBRARIES}") + message(STATUS "Detected broken gflags install in: ${gflags_DIR}, " + "version: ${gflags_VERSION} <= 2.1.2 which defines gflags_LIBRARIES = " + "${gflags_LIBRARIES} which is not an imported CMake target, see: " + "https://github.com/gflags/gflags/issues/110. Attempting to fix by " + "detecting correct gflags target.") + # Ordering here expresses preference for detection, specifically we do not + # want to use the _nothreads variants if the full library is available. 
+ list(APPEND CHECK_GFLAGS_IMPORTED_TARGET_NAMES + gflags-shared gflags-static + gflags_nothreads-shared gflags_nothreads-static) + foreach(CHECK_GFLAGS_TARGET ${CHECK_GFLAGS_IMPORTED_TARGET_NAMES}) + if (TARGET ${CHECK_GFLAGS_TARGET}) + message(STATUS "Found valid gflags target: ${CHECK_GFLAGS_TARGET}, " + "updating gflags_LIBRARIES.") + set(gflags_LIBRARIES ${CHECK_GFLAGS_TARGET}) + break() + endif() + endforeach() + if (NOT TARGET ${gflags_LIBRARIES}) + message(STATUS "Failed to fix detected broken gflags install in: " + "${gflags_DIR}, version: ${gflags_VERSION} <= 2.1.2, none of the " + "imported targets for gflags: ${CHECK_GFLAGS_IMPORTED_TARGET_NAMES} " + "are defined. Will continue with a manual search for gflags " + "components. We recommend you build/install a version of gflags > " + "2.1.2 (or master).") + set(FOUND_INSTALLED_GFLAGS_CMAKE_CONFIGURATION FALSE) + endif() + endif() + endif() + + if (FOUND_INSTALLED_GFLAGS_CMAKE_CONFIGURATION) + message(STATUS "Detected gflags version: ${gflags_VERSION}") + set(GFLAGS_FOUND ${gflags_FOUND}) + set(GFLAGS_INCLUDE_DIR ${gflags_INCLUDE_DIR}) + set(GFLAGS_LIBRARY ${gflags_LIBRARIES}) + + # gflags does not export the namespace in their CMake configuration, so + # use our function to determine what it should be, as it can be either + # gflags or google dependent upon version & configuration. + # + # NOTE: We use the regex method to determine the namespace here, as + # check_cxx_source_compiles() will not use imported targets, which + # is what gflags will be in this case. 
+ gflags_check_gflags_namespace_using_regex() + + if (NOT GFLAGS_NAMESPACE) + gflags_report_not_found( + "Failed to determine gflags namespace using regex for gflags " + "version: ${gflags_VERSION} exported here: ${gflags_DIR} using CMake.") + endif (NOT GFLAGS_NAMESPACE) + else (FOUND_INSTALLED_GFLAGS_CMAKE_CONFIGURATION) + message(STATUS "Failed to find an installed/exported CMake configuration " + "for gflags, will perform search for installed gflags components.") + endif (FOUND_INSTALLED_GFLAGS_CMAKE_CONFIGURATION) +endif(GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION) + +if (NOT GFLAGS_FOUND) + # Either failed to find an exported gflags CMake configuration, or user + # told us not to use one. Perform a manual search for all gflags components. + + # Handle possible presence of lib prefix for libraries on MSVC, see + # also GFLAGS_RESET_FIND_LIBRARY_PREFIX(). + if (MSVC) + # Preserve the caller's original values for CMAKE_FIND_LIBRARY_PREFIXES + # s/t we can set it back before returning. + set(CALLERS_CMAKE_FIND_LIBRARY_PREFIXES "${CMAKE_FIND_LIBRARY_PREFIXES}") + # The empty string in this list is important, it represents the case when + # the libraries have no prefix (shared libraries / DLLs). + set(CMAKE_FIND_LIBRARY_PREFIXES "lib" "" "${CMAKE_FIND_LIBRARY_PREFIXES}") + endif (MSVC) + + # Search user-installed locations first, so that we prefer user installs + # to system installs where both exist. + list(APPEND GFLAGS_CHECK_INCLUDE_DIRS + /usr/local/include + /usr/local/homebrew/include # Mac OS X + /opt/local/var/macports/software # Mac OS X. + /opt/local/include + /usr/include) + list(APPEND GFLAGS_CHECK_PATH_SUFFIXES + gflags/include # Windows (for C:/Program Files prefix). + gflags/Include ) # Windows (for C:/Program Files prefix). + + list(APPEND GFLAGS_CHECK_LIBRARY_DIRS + /usr/local/lib + /usr/local/homebrew/lib # Mac OS X. + /opt/local/lib + /usr/lib) + list(APPEND GFLAGS_CHECK_LIBRARY_SUFFIXES + gflags/lib # Windows (for C:/Program Files prefix). 
+ gflags/Lib ) # Windows (for C:/Program Files prefix). + + # Search supplied hint directories first if supplied. + find_path(GFLAGS_INCLUDE_DIR + NAMES gflags/gflags.h + PATHS ${GFLAGS_INCLUDE_DIR_HINTS} + ${GFLAGS_CHECK_INCLUDE_DIRS} + PATH_SUFFIXES ${GFLAGS_CHECK_PATH_SUFFIXES}) + if (NOT GFLAGS_INCLUDE_DIR OR + NOT EXISTS ${GFLAGS_INCLUDE_DIR}) + gflags_report_not_found( + "Could not find gflags include directory, set GFLAGS_INCLUDE_DIR " + "to directory containing gflags/gflags.h") + endif (NOT GFLAGS_INCLUDE_DIR OR + NOT EXISTS ${GFLAGS_INCLUDE_DIR}) + + find_library(GFLAGS_LIBRARY NAMES gflags + PATHS ${GFLAGS_LIBRARY_DIR_HINTS} + ${GFLAGS_CHECK_LIBRARY_DIRS} + PATH_SUFFIXES ${GFLAGS_CHECK_LIBRARY_SUFFIXES}) + if (NOT GFLAGS_LIBRARY OR + NOT EXISTS ${GFLAGS_LIBRARY}) + gflags_report_not_found( + "Could not find gflags library, set GFLAGS_LIBRARY " + "to full path to libgflags.") + endif (NOT GFLAGS_LIBRARY OR + NOT EXISTS ${GFLAGS_LIBRARY}) + + # gflags typically requires a threading library (which is OS dependent), note + # that this defines the CMAKE_THREAD_LIBS_INIT variable. If we are able to + # detect threads, we assume that gflags requires it. + find_package(Threads QUIET) + set(GFLAGS_LINK_LIBRARIES ${CMAKE_THREAD_LIBS_INIT}) + # On Windows (including MinGW), the Shlwapi library is used by gflags if + # available. + if (WIN32) + include(CheckIncludeFileCXX) + check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI) + if (HAVE_SHLWAPI) + list(APPEND GFLAGS_LINK_LIBRARIES shlwapi.lib) + endif(HAVE_SHLWAPI) + endif (WIN32) + + # Mark internally as found, then verify. GFLAGS_REPORT_NOT_FOUND() unsets + # if called. + set(GFLAGS_FOUND TRUE) + + # Identify what namespace gflags was built with. 
+ if (GFLAGS_INCLUDE_DIR AND NOT GFLAGS_NAMESPACE) + # To handle Windows peculiarities / CMake bugs on MSVC we try two approaches + # to detect the gflags namespace: + # + # 1) Try to use check_cxx_source_compiles() to compile a trivial program + # with the two choices for the gflags namespace. + # + # 2) [In the event 1) fails] Use regex on the gflags headers to try to + # determine the gflags namespace. Whilst this is less robust than 1), + # it does avoid any interaction with msbuild. + gflags_check_gflags_namespace_using_try_compile() + + if (NOT GFLAGS_NAMESPACE) + # Failed to determine gflags namespace using check_cxx_source_compiles() + # method, try and obtain it using regex on the gflags headers instead. + message(STATUS "Failed to find gflags namespace using using " + "check_cxx_source_compiles(), trying namespace regex instead, " + "this is expected on Windows.") + gflags_check_gflags_namespace_using_regex() + + if (NOT GFLAGS_NAMESPACE) + gflags_report_not_found( + "Failed to determine gflags namespace either by " + "check_cxx_source_compiles(), or namespace regex.") + endif (NOT GFLAGS_NAMESPACE) + endif (NOT GFLAGS_NAMESPACE) + endif (GFLAGS_INCLUDE_DIR AND NOT GFLAGS_NAMESPACE) + + # Make the GFLAGS_NAMESPACE a cache variable s/t the user can view it, and could + # overwrite it in the CMake GUI. + set(GFLAGS_NAMESPACE "${GFLAGS_NAMESPACE}" CACHE STRING + "gflags namespace (google or gflags)" FORCE) + + # gflags does not seem to provide any record of the version in its + # source tree, thus cannot extract version. + + # Catch case when caller has set GFLAGS_NAMESPACE in the cache / GUI + # with an invalid value. 
+ if (GFLAGS_NAMESPACE AND + NOT GFLAGS_NAMESPACE STREQUAL "google" AND + NOT GFLAGS_NAMESPACE STREQUAL "gflags") + gflags_report_not_found( + "Caller defined GFLAGS_NAMESPACE:" + " ${GFLAGS_NAMESPACE} is not valid, not google or gflags.") + endif () + # Catch case when caller has set GFLAGS_INCLUDE_DIR in the cache / GUI and + # thus FIND_[PATH/LIBRARY] are not called, but specified locations are + # invalid, otherwise we would report the library as found. + if (GFLAGS_INCLUDE_DIR AND + NOT EXISTS ${GFLAGS_INCLUDE_DIR}/gflags/gflags.h) + gflags_report_not_found( + "Caller defined GFLAGS_INCLUDE_DIR:" + " ${GFLAGS_INCLUDE_DIR} does not contain gflags/gflags.h header.") + endif (GFLAGS_INCLUDE_DIR AND + NOT EXISTS ${GFLAGS_INCLUDE_DIR}/gflags/gflags.h) + # TODO: This regex for gflags library is pretty primitive, we use lowercase + # for comparison to handle Windows using CamelCase library names, could + # this check be better? + string(TOLOWER "${GFLAGS_LIBRARY}" LOWERCASE_GFLAGS_LIBRARY) + if (GFLAGS_LIBRARY AND + NOT "${LOWERCASE_GFLAGS_LIBRARY}" MATCHES ".*gflags[^/]*") + gflags_report_not_found( + "Caller defined GFLAGS_LIBRARY: " + "${GFLAGS_LIBRARY} does not match gflags.") + endif (GFLAGS_LIBRARY AND + NOT "${LOWERCASE_GFLAGS_LIBRARY}" MATCHES ".*gflags[^/]*") + + gflags_reset_find_library_prefix() + +endif(NOT GFLAGS_FOUND) + +# Set standard CMake FindPackage variables if found. +if (GFLAGS_FOUND) + set(GFLAGS_INCLUDE_DIRS ${GFLAGS_INCLUDE_DIR}) + set(GFLAGS_LIBRARIES ${GFLAGS_LIBRARY} ${GFLAGS_LINK_LIBRARIES}) +endif (GFLAGS_FOUND) + +# Handle REQUIRED / QUIET optional arguments. +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Gflags DEFAULT_MSG + GFLAGS_INCLUDE_DIRS GFLAGS_LIBRARIES GFLAGS_NAMESPACE) + +# Only mark internal variables as advanced if we found gflags, otherwise +# leave them visible in the standard GUI for the user to set manually. 
+if (GFLAGS_FOUND) + mark_as_advanced(FORCE GFLAGS_INCLUDE_DIR + GFLAGS_LIBRARY + GFLAGS_NAMESPACE + gflags_DIR) # Autogenerated by find_package(gflags) +endif (GFLAGS_FOUND) diff --git a/cmake/FindGlog.cmake b/cmake/FindGlog.cmake new file mode 100644 index 00000000000000..142e2ca96ba76d --- /dev/null +++ b/cmake/FindGlog.cmake @@ -0,0 +1,24 @@ +# +# Find libglog +# +# LIBGLOG_INCLUDE_DIR - where to find glog/logging.h, etc. +# LIBGLOG_LIBRARY - List of libraries when using libglog. +# LIBGLOG_FOUND - True if libglog found. +# +# from https://github.com/facebook/hhvm/blob/master/CMake/FindGlog.cmake + +IF (LIBGLOG_INCLUDE_DIR) + # Already in cache, be silent + SET(LIBGLOG_FIND_QUIETLY TRUE) +ENDIF () + +FIND_PATH(LIBGLOG_INCLUDE_DIR glog/logging.h) + +FIND_LIBRARY(LIBGLOG_LIBRARY glog) + +# handle the QUIETLY and REQUIRED arguments and set LIBGLOG_FOUND to TRUE if +# all listed variables are TRUE +INCLUDE(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(LIBGLOG DEFAULT_MSG LIBGLOG_LIBRARY LIBGLOG_INCLUDE_DIR) + +MARK_AS_ADVANCED(LIBGLOG_LIBRARY LIBGLOG_INCLUDE_DIR) \ No newline at end of file diff --git a/cmake/FindNumPy.cmake b/cmake/FindNumPy.cmake new file mode 100644 index 00000000000000..8cdd642ac01315 --- /dev/null +++ b/cmake/FindNumPy.cmake @@ -0,0 +1,38 @@ +# Find the Python NumPy package +# PYTHON_NUMPY_INCLUDE_DIR +# NUMPY_FOUND +# will be set by this script + +cmake_minimum_required(VERSION 2.6) + +if(NOT PYTHON_EXECUTABLE) + if(NumPy_FIND_QUIETLY) + find_package(PythonInterp QUIET) + else() + find_package(PythonInterp) + set(_numpy_out 1) + endif() +endif() + +if (PYTHON_EXECUTABLE) + # write a python script that finds the numpy path + file(WRITE ${PROJECT_BINARY_DIR}/FindNumpyPath.py + "try: import numpy; print(numpy.get_include())\nexcept:pass\n") + + # execute the find script + exec_program("${PYTHON_EXECUTABLE}" ${PROJECT_BINARY_DIR} + ARGS "FindNumpyPath.py" + OUTPUT_VARIABLE NUMPY_PATH) +elseif(_numpy_out) + message(STATUS 
"Python executable not found.") +endif(PYTHON_EXECUTABLE) + +find_path(PYTHON_NUMPY_INCLUDE_DIR numpy/arrayobject.h + HINTS "${NUMPY_PATH}" "${PYTHON_INCLUDE_PATH}") + +if(PYTHON_NUMPY_INCLUDE_DIR) + set(PYTHON_NUMPY_FOUND 1 CACHE INTERNAL "Python numpy found") +endif(PYTHON_NUMPY_INCLUDE_DIR) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NumPy DEFAULT_MSG PYTHON_NUMPY_INCLUDE_DIR) diff --git a/cmake/FindPythonModule.cmake b/cmake/FindPythonModule.cmake new file mode 100644 index 00000000000000..2eb3441428e829 --- /dev/null +++ b/cmake/FindPythonModule.cmake @@ -0,0 +1,30 @@ +# Find if a Python module is installed +# Found at http://www.cmake.org/pipermail/cmake/2011-January/041666.html +# To use do: find_python_module(PyQt4 REQUIRED) +function(find_python_module module) + string(TOUPPER ${module} module_upper) + if(NOT PY_${module_upper}) + if(ARGC GREATER 1 AND ARGV1 STREQUAL "REQUIRED") + set(${module}_FIND_REQUIRED TRUE) + else() + set(${module}_FIND_REQUIRED FALSE) + endif() + # A module's location is usually a directory, but for binary modules + # it's a .so file. 
+ execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" + "import re, ${module}; print(re.compile('/__init__.py.*').sub('',${module}.__file__))" + RESULT_VARIABLE _${module}_status + OUTPUT_VARIABLE _${module}_location + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT _${module}_status) + set(PY_${module_upper} ${_${module}_location} CACHE STRING + "Location of Python module ${module}") + endif(NOT _${module}_status) + endif(NOT PY_${module_upper}) + find_package_handle_standard_args(PY_${module} DEFAULT_MSG PY_${module_upper}) + if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED) + message(FATAL_ERROR "python module ${module} is not found") + endif() + set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} PARENT_SCOPE) +endfunction(find_python_module) diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake new file mode 100644 index 00000000000000..6702f45a168bf0 --- /dev/null +++ b/cmake/FindSphinx.cmake @@ -0,0 +1,146 @@ +# - This module looks for Sphinx +# Find the Sphinx documentation generator +# +# This modules defines +# SPHINX_EXECUTABLE +# SPHINX_FOUND + +find_program(SPHINX_EXECUTABLE + NAMES sphinx-build + PATHS + /usr/bin + /usr/local/bin + /opt/local/bin + DOC "Sphinx documentation generator" +) + +if( NOT SPHINX_EXECUTABLE ) + set(_Python_VERSIONS + 2.7 2.6 2.5 2.4 2.3 2.2 2.1 2.0 1.6 1.5 + ) + + foreach( _version ${_Python_VERSIONS} ) + set( _sphinx_NAMES sphinx-build-${_version} ) + + find_program( SPHINX_EXECUTABLE + NAMES ${_sphinx_NAMES} + PATHS + /usr/bin + /usr/local/bin + /opt/loca/bin + DOC "Sphinx documentation generator" + ) + endforeach() +endif() + +include(FindPackageHandleStandardArgs) + +find_package_handle_standard_args(Sphinx DEFAULT_MSG + SPHINX_EXECUTABLE +) + + +option( SPHINX_HTML_OUTPUT "Build a single HTML with the whole content." ON ) +option( SPHINX_DIRHTML_OUTPUT "Build HTML pages, but with a single directory per document." 
OFF ) +option( SPHINX_HTMLHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in htmlhelp." OFF ) +option( SPHINX_QTHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in qthelp." OFF ) +option( SPHINX_DEVHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in devhelp." OFF ) +option( SPHINX_EPUB_OUTPUT "Build HTML pages with additional information for building a documentation collection in epub." OFF ) +option( SPHINX_LATEX_OUTPUT "Build LaTeX sources that can be compiled to a PDF document using pdflatex." OFF ) +option( SPHINX_MAN_OUTPUT "Build manual pages in groff format for UNIX systems." OFF ) +option( SPHINX_TEXT_OUTPUT "Build plain text files." OFF ) + + +mark_as_advanced( + SPHINX_EXECUTABLE + SPHINX_HTML_OUTPUT + SPHINX_DIRHTML_OUTPUT + SPHINX_HTMLHELP_OUTPUT + SPHINX_QTHELP_OUTPUT + SPHINX_DEVHELP_OUTPUT + SPHINX_EPUB_OUTPUT + SPHINX_LATEX_OUTPUT + SPHINX_MAN_OUTPUT + SPHINX_TEXT_OUTPUT +) + +function( Sphinx_add_target target_name builder conf cache source destination ) + add_custom_target( ${target_name} ALL + COMMAND ${SPHINX_EXECUTABLE} -b ${builder} + -d ${cache} + -c ${conf} + ${source} + ${destination} + COMMENT "Generating sphinx documentation: ${builder}" + ) + + set_property( + DIRECTORY APPEND PROPERTY + ADDITIONAL_MAKE_CLEAN_FILES + ${destination} + ) +endfunction() + +# Target dependencies can be optionally listed at the end. 
# Create one documentation target per enabled Sphinx output format.  Extra
# arguments after base_destination are treated as dependencies of every
# generated target.  (NOTE(review): no branch exists for
# SPHINX_HTMLHELP_OUTPUT in the original; omission preserved.)
function(Sphinx_add_targets target_base_name conf source base_destination)

  set(_dependencies)

  foreach(arg IN LISTS ARGN)
    set(_dependencies ${_dependencies} ${arg})
  endforeach()

  # FIX: the original used if( ${VAR} ), which expands to the invalid
  # "if()" and aborts configuration whenever VAR is undefined.  Plain
  # if(VAR) auto-dereferences and is safe; BUILD_TESTING in particular is
  # only defined once CTest has been included.
  if(SPHINX_HTML_OUTPUT)
    Sphinx_add_target(${target_base_name}_html html ${conf} ${source} ${base_destination}/html)

    add_dependencies(${target_base_name}_html ${_dependencies})
  endif()

  if(SPHINX_DIRHTML_OUTPUT)
    Sphinx_add_target(${target_base_name}_dirhtml dirhtml ${conf} ${source} ${base_destination}/dirhtml)

    add_dependencies(${target_base_name}_dirhtml ${_dependencies})
  endif()

  if(SPHINX_QTHELP_OUTPUT)
    Sphinx_add_target(${target_base_name}_qthelp qthelp ${conf} ${source} ${base_destination}/qthelp)

    add_dependencies(${target_base_name}_qthelp ${_dependencies})
  endif()

  if(SPHINX_DEVHELP_OUTPUT)
    Sphinx_add_target(${target_base_name}_devhelp devhelp ${conf} ${source} ${base_destination}/devhelp)

    add_dependencies(${target_base_name}_devhelp ${_dependencies})
  endif()

  if(SPHINX_EPUB_OUTPUT)
    Sphinx_add_target(${target_base_name}_epub epub ${conf} ${source} ${base_destination}/epub)

    add_dependencies(${target_base_name}_epub ${_dependencies})
  endif()

  if(SPHINX_LATEX_OUTPUT)
    Sphinx_add_target(${target_base_name}_latex latex ${conf} ${source} ${base_destination}/latex)

    add_dependencies(${target_base_name}_latex ${_dependencies})
  endif()

  if(SPHINX_MAN_OUTPUT)
    Sphinx_add_target(${target_base_name}_man man ${conf} ${source} ${base_destination}/man)

    add_dependencies(${target_base_name}_man ${_dependencies})
  endif()

  if(SPHINX_TEXT_OUTPUT)
    Sphinx_add_target(${target_base_name}_text text ${conf} ${source} ${base_destination}/text)

    add_dependencies(${target_base_name}_text ${_dependencies})
  endif()

  if(BUILD_TESTING)
    sphinx_add_target(${target_base_name}_linkcheck linkcheck ${conf} ${source}
      ${base_destination}/linkcheck)

    add_dependencies(${target_base_name}_linkcheck ${_dependencies})
  endif()
endfunction()

# ---------------------------------------------------------------------------
# cmake/cblas.cmake
# Find the CBlas libraries
#
# It will search MKL, atlas, OpenBlas, reference-cblas in order.
#
# If any cblas implementation found, the following variable will be set.
#    CBLAS_PROVIDER  # one of MKL, ATLAS, OPENBLAS, REFERENCE
#    CBLAS_INC_DIR   # the include directory for cblas.
#    CBLAS_LIBS      # a list of libraries should be linked by paddle.
#                    # Each library should be full path to object file.
#
# User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT
# during cmake. If none of them set, it will try to find cblas implementation in
# system paths.

## Find MKL First.
set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")

find_path(MKL_INCLUDE_DIR mkl.h PATHS ${MKL_ROOT}/include)
find_library(MKL_CORE_LIB NAMES mkl_core PATHS ${MKL_ROOT}/lib)
find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS ${MKL_ROOT}/lib)
find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS ${MKL_ROOT}/lib)

if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
  set(CBLAS_PROVIDER MKL)
  set(CBLAS_INC_DIR ${MKL_INCLUDE_DIR})
  set(CBLAS_LIBS ${MKL_INTEL_LP64}
    ${MKL_SEQUENTIAL_LIB}
    ${MKL_CORE_LIB})
  add_definitions(-DPADDLE_USE_MKL)
  return() # return file.
endif()

## Then find atlas.
set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
set(ATLAS_INCLUDE_SEARCH_PATHS
  ${ATLAS_ROOT}/include
  /usr/include
  /usr/include/atlas)
set(ATLAS_LIB_SEARCH_PATHS
  ${ATLAS_ROOT}/lib
  /usr/lib
  /usr/lib/blas/atlas
  /usr/lib/atlas
  /usr/lib/atlas-base # special for ubuntu 14.04.
+ ) +find_path(ATLAS_INC_DIR NAMES cblas.h + PATHS ${ATLAS_INCLUDE_SEARCH_PATHS}) +find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3 + PATHS ${ATLAS_LIB_SEARCH_PATHS}) +find_library(ATLAS_LIB NAMES atlas libatlas.so.3 + PATHS ${ATLAS_LIB_SEARCH_PATHS}) + +if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB) + set(CBLAS_PROVIDER ATLAS) + set(CBLAS_INC_DIR ${ATLAS_INC_DIR}) + set(CBLAS_LIBS ${ATLAS_LIB} ${ATLAS_CBLAS_LIB}) + return() +endif() + +## Then find openblas. +set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas") +set(OPENBLAS_INCLUDE_SEARCH_PATHS + ${OPENBLAS_ROOT}/include + /usr/include + /usr/include/openblas) +set(OPENBLAS_LIB_SEARCH_PATHS + ${OPENBLAS_ROOT}/lib + /usr/lib + /usr/lib/blas/openblas + /usr/lib/openblas) + +find_path(OPENBLAS_INC_DIR NAMES cblas.h + PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) +find_library(OPENBLAS_LIB NAMES openblas + PATHS ${OPENBLAS_LIB_SEARCH_PATHS}) + +if(OPENBLAS_INC_DIR AND OPENBLAS_LIB) + set(CBLAS_PROVIDER OPENBLAS) + set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR}) + set(CBLAS_LIBS ${OPENBLAS_LIB}) + return() +endif() + + +## Then find the reference-cblas. www.netlib.org/blas/ + + +set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH + "Folder contains reference-cblas") +set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/include + /usr/include + /usr/include/cblas +) + +set(REFERENCE_CBLAS_LIB_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/lib + /usr/lib + /usr/lib/blas/reference/ + /usr/lib/reference/ +) + +find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS + ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS}) +find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS + ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) + +if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) + set(CBLAS_PROVIDER REFERENCE) + set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR}) + set(CBLAS_LIBS ${REFERENCE_CBLAS_LIBRARY}) + return() +endif() + +message(FATAL_ERROR "CBlas must be set. 
Paddle support MKL, ATLAS, OpenBlas, reference-cblas." + " Try set MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT or REFERENCE_CBLAS_ROOT.") diff --git a/cmake/ccache.cmake b/cmake/ccache.cmake new file mode 100644 index 00000000000000..968d41801d73c4 --- /dev/null +++ b/cmake/ccache.cmake @@ -0,0 +1,9 @@ +# Use ccache if found ccache program + +find_program(CCACHE_FOUND ccache) + +if(CCACHE_FOUND) + message(STATUS "Ccache is founded, use ccache to speed up compile.") + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) +endif(CCACHE_FOUND) \ No newline at end of file diff --git a/cmake/check_packages.cmake b/cmake/check_packages.cmake new file mode 100644 index 00000000000000..3bc0c1fd18448e --- /dev/null +++ b/cmake/check_packages.cmake @@ -0,0 +1,45 @@ +# Check package for each cmake option + +if(WITH_GPU) + find_package(CUDA REQUIRED) # CUDA is required when use gpu +endif() + +if(WITH_PYTHON) + find_package(PythonLibs 2.6 REQUIRED) + find_package(PythonInterp REQUIRED) + find_package(NumPy REQUIRED) +endif() + +if(WITH_STYLE_CHECK) + find_package(PythonInterp REQUIRED) +endif() + +if(WITH_GLOG) + find_package(Glog REQUIRED) +endif() + +if(WITH_GFLAGS) + find_package(Gflags REQUIRED) +endif() + +if(WITH_TESTING) + find_package(GTest REQUIRED) +endif() + +if(WITH_DOC) + find_package(Sphinx REQUIRED) + find_package(Doxygen REQUIRED) + find_python_module(recommonmark REQUIRED) + find_python_module(breathe REQUIRED) +endif() + +if(WITH_SWIG_PY) + if(NOT SWIG_FOUND) + message(FATAL_ERROR "SWIG is not found. 
Please install swig or disable WITH_SWIG_PY")
  endif()
  find_python_module(wheel REQUIRED) # package wheel
endif()

if(NOT M4_EXECUTABLE)
  message(FATAL_ERROR "Paddle need m4 to generate proto file.")
endif()

# ---------------------------------------------------------------------------
# cmake/cpplint.cmake
# util to check C++ file style
# * it basically use google cpplint.py.
# * It provide "add_style_check_target" for cmake.
#   Usage see add_style_check_target's document
#
# TODO(yuyang18): Add python style check.

set(STYLE_FILTER)

# disable unwanted filters   (typo fixed: was "diable")

# paddle do not indent public/protected/private in class
set(STYLE_FILTER "${STYLE_FILTER}-whitespace/indent,")
# paddle use mutable reference. BUT IT IS NOT RECOMMENDED
set(STYLE_FILTER "${STYLE_FILTER}-runtime/references,")
# paddle use relative path for include.
set(STYLE_FILTER "${STYLE_FILTER}-build/include,")
# paddle use C++11 headers such as <thread>, <mutex>, etc.
# (NOTE(review): the original header names were lost in extraction; confirm.)
set(STYLE_FILTER "${STYLE_FILTER}-build/c++11,")
# paddle use c style casting. BUT IT IS NOT RECOMMENDED
set(STYLE_FILTER "${STYLE_FILTER}-readability/casting")


# IGNORE SOME FILES
set(IGNORE_PATTERN
  .*ImportanceSampler.*
  .*cblas\\.h.*
  .*LtrDataProvider.*
  .*MultiDataProvider.*)

# add_style_check_target
#
# attach check code style step for target.
#
# first argument: target name to attach
# rest arguments: source list to check code style.
#
# NOTE: If WITH_STYLE_CHECK is OFF, then this macro just do nothing.
macro(add_style_check_target TARGET_NAME)
  if(WITH_STYLE_CHECK)
    set(SOURCES_LIST ${ARGN})
    list(REMOVE_DUPLICATES SOURCES_LIST)
    list(SORT SOURCES_LIST)

    foreach(filename ${SOURCES_LIST})
      set(LINT ON)
      foreach(pattern ${IGNORE_PATTERN})
        if(filename MATCHES ${pattern})
          # ${filename} restored here and below: the extraction had replaced
          # it with a "$(unknown)" placeholder.
          message(STATUS "DROP LINT ${filename}")
          set(LINT OFF)
        endif()
      endforeach()
      if(LINT MATCHES ON)
        add_custom_command(TARGET ${TARGET_NAME}
          PRE_BUILD
          COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
                  "--filter=${STYLE_FILTER}" ${filename}
          WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
      endif()
    endforeach()
  endif()
endmacro()

# ---------------------------------------------------------------------------
# cmake/cudnn.cmake
set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT")
find_path(CUDNN_INCLUDE_DIR cudnn.h
  PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
  $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
  NO_DEFAULT_PATH)

get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)

list(APPEND CUDNN_CHECK_LIBRARY_DIRS
  ${CUDNN_ROOT}
  ${CUDNN_ROOT}/lib64
  ${CUDNN_ROOT}/lib
  $ENV{CUDNN_ROOT}
  $ENV{CUDNN_ROOT}/lib64
  $ENV{CUDNN_ROOT}/lib
  /usr/lib)
find_library(CUDNN_LIBRARY NAMES libcudnn.so # libcudnn_static.a
  PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
  NO_DEFAULT_PATH
  DOC "Path to cuDNN library.")


if(CUDNN_INCLUDE_DIR AND CUDNN_LIBRARY)
  set(CUDNN_FOUND ON)
else()
  set(CUDNN_FOUND OFF)
endif()

if(CUDNN_FOUND)
  file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)

  get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY)

  string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)"
    CUDNN_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
  string(REGEX REPLACE "define CUDNN_VERSION +([0-9]+)" "\\1"
    CUDNN_VERSION "${CUDNN_VERSION}")

  if("${CUDNN_VERSION}" STREQUAL "2000")
    message(STATUS
"Current cuDNN version is v2. ") + else() + string(REGEX MATCH "define CUDNN_MAJOR +([0-9]+)" CUDNN_MAJOR_VERSION + "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_MAJOR +([0-9]+)" "\\1" + CUDNN_MAJOR_VERSION "${CUDNN_MAJOR_VERSION}") + string(REGEX MATCH "define CUDNN_MINOR +([0-9]+)" CUDNN_MINOR_VERSION + "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_MINOR +([0-9]+)" "\\1" + CUDNN_MINOR_VERSION "${CUDNN_MINOR_VERSION}") + string(REGEX MATCH "define CUDNN_PATCHLEVEL +([0-9]+)" + CUDNN_PATCHLEVEL_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_PATCHLEVEL +([0-9]+)" "\\1" + CUDNN_PATCHLEVEL_VERSION "${CUDNN_PATCHLEVEL_VERSION}") + + if(NOT CUDNN_MAJOR_VERSION) + set(CUDNN_VERSION "???") + else() + math(EXPR CUDNN_VERSION + "${CUDNN_MAJOR_VERSION} * 1000 + + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") + endif() + + message(STATUS "Current cuDNN header is ${CUDNN_INCLUDE_DIR}/cudnn.h. " + "Current cuDNN version is v${CUDNN_MAJOR_VERSION}. ") + + endif() +endif() diff --git a/cmake/enableCXX11.cmake b/cmake/enableCXX11.cmake new file mode 100644 index 00000000000000..dc8cc3371aa6e5 --- /dev/null +++ b/cmake/enableCXX11.cmake @@ -0,0 +1,13 @@ +# Enable C++ 11 for GCC. +# NOTE: It's only tested for gcc. 
+include(CheckCXXCompilerFlag) +CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORT_CXX11) +CHECK_CXX_COMPILER_FLAG("-std=c++0x" COMPILER_SUPPORT_CXX0X) + +if(COMPILER_SUPPORT_CXX11) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +elseif(COMPILER_SUPPORT_CXX0X) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x") +else() + message(FATAL_ERROR "Your compiler must support c++11") +endif() \ No newline at end of file diff --git a/cmake/flags.cmake b/cmake/flags.cmake new file mode 100644 index 00000000000000..351af42ee6f6ad --- /dev/null +++ b/cmake/flags.cmake @@ -0,0 +1,86 @@ +# Setting Paddle Compile Flags +include(CheckCXXCompilerFlag) +include(CheckCCompilerFlag) +include(CheckCXXSymbolExists) +# safe_set_flag +# +# Set a compile flag only if compiler is support +# is_c: is C flag or C++ flag, bool type. +# src_list: The list name which the flag name will be append to. +# flag_name: the flag name for compiler, such as '-Werror' '-Wall' etc +# rest arguments: not used. +function(safe_set_flag is_c src_list flag_name) + string(REPLACE "-" "_" safe_name ${flag_name}) + string(REPLACE "=" "_" safe_name ${safe_name}) + if(is_c) + CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name}) + set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name}) + else() + CHECK_CXX_COMPILER_FLAG(${flag_name} CXX_COMPILER_SUPPORT_FLAG_${safe_name}) + set(safe_name CXX_COMPILER_SUPPORT_FLAG_${safe_name}) + endif() + if(${safe_name}) + set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE) + if(is_c) + set(CUDA_NVCC_FLAGS + --compiler-options;${flag_name} + ${CUDA_NVCC_FLAGS} + PARENT_SCOPE) + endif() + endif() +endfunction() + +# helper macro to set cflag +macro(safe_set_cflag src_list flag_name) + safe_set_flag(ON ${src_list} ${flag_name}) +endmacro() + +# helper macro to set cxxflag +macro(safe_set_cxxflag src_list flag_name) + safe_set_flag(OFF ${src_list} ${flag_name}) +endmacro() + +CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) +if(NOT 
UINT64_MAX_EXISTS) + set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS) + CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS_HERE) + if(UINT64_MAX_EXISTS_HERE) + set(CMAKE_REQUIRED_DEFINITIONS) + add_definitions(-D__STDC_LIMIT_MACROS) + else() + message(FATAL_ERROR "Cannot find symbol UINT64_MAX") + endif() +endif() + +# Common flags. the compiler flag used for C/C++ sources whenever release or debug +# Do not care if this flag is support for gcc. +set(COMMON_FLAGS + -fPIC + -fno-omit-frame-pointer + -Wall + -Wextra + -Werror + -Wnon-virtual-dtor + -Wdelete-non-virtual-dtor + -Wno-unused-parameter + -Wno-error=literal-suffix + -Wno-error=unused-local-typedefs) + +foreach(flag ${COMMON_FLAGS}) + safe_set_cflag(CMAKE_C_FLAGS ${flag}) + safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) +endforeach() + +# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. +# So, don't set these flags here. + +foreach(capability 30 35 50) + list(APPEND __arch_flags "-gencode arch=compute_${capability},code=sm_${capability}") +endforeach() + +if (CUDA_VERSION VERSION_GREATER "7.0") + list(APPEND __arch_flags "-gencode arch=compute_52,code=sm_52") +endif() + +set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS}) + diff --git a/cmake/package.cmake b/cmake/package.cmake new file mode 100644 index 00000000000000..211593f358eb34 --- /dev/null +++ b/cmake/package.cmake @@ -0,0 +1,21 @@ +set(CPACK_PACKAGE_NAME paddle) +set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "") +set(CPACK_PACKAGE_VERSION_MAJOR ${PADDLE_MAJOR_VERSION}) +set(CPACK_PACKAGE_VERSION_MINOR ${PADDLE_MINOR_VERSION}) +set(CPACK_PACKAGE_VERSION_PATCH ${PADDLE_PATCH_VERSION}) +set(CPACK_PACKAGE_VERSION ${PADDLE_VERSION}) +## DEB Settings +set(CPACK_DEBIAN_PACKAGE_NAME paddle) +set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE amd64) +set(CPACK_DEBIAN_PACKAGE_MAINTAINER PaddlePaddle Dev ) +set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Paddle") +set(CPACK_PACKAGE_DESCRIPTION "") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "libatlas3-base, 
libgflags2, libgoogle-glog0, libprotobuf8, libpython2.7, libstdc++6, python-numpy, python-pip, python-pip-whl, python-protobuf") +set(CPACK_DEBIAN_PACKAGE_SECTION Devel) +set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJ_ROOT}/paddle/scripts/deb/postinst") +#set(CPACK_GENERATOR "DEB") +# Start cpack +include (CMakePackageConfigHelpers) +include (CPack) + + diff --git a/cmake/swig.cmake b/cmake/swig.cmake new file mode 100644 index 00000000000000..f5c1bcc79b3dc0 --- /dev/null +++ b/cmake/swig.cmake @@ -0,0 +1,36 @@ +find_program( + SWIG_BINARY_PATH + swig) + +if(${SWIG_BINARY_PATH} STREQUAL "SWIG_BINARY_PATH-NOTFOUND") + set(SWIG_FOUND OFF) +else() + set(SWIG_FOUND ON) +endif() + +set(MIN_SWIG_VERSION 2) +if(SWIG_FOUND) + execute_process(COMMAND sh -c "${SWIG_BINARY_PATH} -version | grep Version | cut -f3 -d' '" + OUTPUT_VARIABLE _SWIG_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(${_SWIG_VERSION} VERSION_LESS ${MIN_SWIG_VERSION}) + message("swig version ${MIN_SWIG_VERSION} or greater is needed for generating python api. " + "Only version ${_SWIG_VERSION} is found. 
Set SWIG_FOUND to FALSE") + set(SWIG_FOUND FALSE) + endif(${_SWIG_VERSION} VERSION_LESS ${MIN_SWIG_VERSION}) +endif(SWIG_FOUND) + +function(generate_python_api target_name) + add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py + ${PROJ_ROOT}/paddle/Paddle_wrap.cxx + ${PROJ_ROOT}/paddle/Paddle_wrap.h + COMMAND swig -python -c++ -outcurrentdir -I../ api/Paddle.swig + && mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py + DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig + WORKING_DIRECTORY ${PROJ_ROOT}/paddle + COMMENT "Generate Python API from swig") + add_custom_target(${target_name} ALL DEPENDS + ${PROJ_ROOT}/paddle/Paddle_wrap.cxx + ${PROJ_ROOT}/paddle/Paddle_wrap.h + ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py) +endfunction(generate_python_api) diff --git a/cmake/util.cmake b/cmake/util.cmake new file mode 100644 index 00000000000000..e0e372fed0b049 --- /dev/null +++ b/cmake/util.cmake @@ -0,0 +1,147 @@ +# Some common routine for paddle compile. + + +# target_circle_link_libraries +# Link libraries to target which has circle dependencies. +# +# First Argument: target name want to be linked with libraries +# Rest Arguments: libraries which link together. +function(target_circle_link_libraries TARGET_NAME) + target_link_libraries(${TARGET_NAME} + -Wl,--start-group + ${ARGN} + -Wl,--end-group) +endfunction() + +# compile_cu_as_cpp +# Make a cu file compiled as C++ +# Arguments: Source files +macro(compile_cu_as_cpp) + foreach(s ${ARGN}) + set_source_files_properties(${s} PROPERTIES LANGUAGE CXX) + set_source_files_properties(${s} PROPERTIES COMPILE_FLAGS "-x c++") + endforeach() +endmacro() + +# link_paddle_exe +# add paddle library for a paddle executable, such as trainer, pserver. +# +# It will handle WITH_PYTHON/WITH_GLOG etc. 
+function(link_paddle_exe TARGET_NAME) + if(WITH_METRIC) + if(WITH_GPU) + set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric metric_cpu) + else() + set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric_cpu) + endif() + else() + set(METRIC_LIBS "") + endif() + + if(PADDLE_WITH_INTERNAL) + set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter) + target_circle_link_libraries(${TARGET_NAME} + -Wl,--whole-archive + paddle_internal_gserver + paddle_internal_owlqn + -Wl,--no-whole-archive + paddle_internal_parameter) + else() + set(INTERAL_LIBS "") + endif() + + target_circle_link_libraries(${TARGET_NAME} + -Wl,--whole-archive + paddle_gserver + ${METRIC_LIBS} + -Wl,--no-whole-archive + paddle_pserver + paddle_trainer_lib + paddle_network + paddle_math + paddle_utils + paddle_parameter + paddle_proto + paddle_cuda + ${METRIC_LIBS} + ${PROTOBUF_LIBRARY} + ${CMAKE_THREAD_LIBS_INIT} + ${CBLAS_LIBS} + ${CMAKE_DL_LIBS} + ${INTERAL_LIBS} + -lz) + + if(WITH_PYTHON) + target_link_libraries(${TARGET_NAME} + ${PYTHON_LIBRARIES}) + endif() + + if(WITH_GLOG) + target_link_libraries(${TARGET_NAME} + ${LIBGLOG_LIBRARY}) + endif() + + if(WITH_GFLAGS) + target_link_libraries(${TARGET_NAME} + ${GFLAGS_LIBRARIES}) + endif() + + if(WITH_GPU) + if(NOT WITH_DSO OR WITH_METRIC) + target_link_libraries(${TARGET_NAME} + ${CUDNN_LIBRARY} + ${CUDA_curand_LIBRARY}) + CUDA_ADD_CUBLAS_TO_TARGET(${TARGET_NAME}) + endif() + + check_library_exists(rt clock_gettime "time.h" HAVE_CLOCK_GETTIME ) + if(HAVE_CLOCK_GETTIME) + target_link_libraries(${TARGET_NAME} rt) + endif() + endif() +endfunction() + +# link_paddle_test +# Link a paddle unittest for target +# TARGET_NAME: the unittest target name +# Rest Arguemnts: not used. +function(link_paddle_test TARGET_NAME) + link_paddle_exe(${TARGET_NAME}) + target_link_libraries(${TARGET_NAME} ${GTEST_MAIN_LIBRARIES} + ${GTEST_LIBRARIES}) +endfunction() + +# add_unittest_without_exec +# +# create a paddle unittest. 
not specifically define how to run this unittest. +# TARGET_NAME: the unittest target name, same as executable file name +# Rest Arguments: the source files to compile this unittest. +macro(add_unittest_without_exec TARGET_NAME) + add_executable(${TARGET_NAME} ${ARGN}) + link_paddle_test(${TARGET_NAME}) + add_style_check_target(${TARGET_NAME} ${ARGN}) +endmacro() + +# add_unittest +# create a paddle unittest and just to execute this binary to make unittest. +# +# TARGET_NAME: the unittest target name, same as executable file name +# Rest Arguments: the source files to compile this unittest. +macro(add_unittest TARGET_NAME) + add_unittest_without_exec(${TARGET_NAME} ${ARGN}) + add_test(${TARGET_NAME} ${TARGET_NAME}) +endmacro() + +# add_simple_unittest +# create a paddle unittest with file name. It just compile ${TARGET_NAME}.cpp to +# ${TARGET_NAME} and then execute it. +macro(add_simple_unittest TARGET_NAME) + add_unittest(${TARGET_NAME} ${TARGET_NAME}.cpp) +endmacro() + +macro(add_paddle_culib TARGET_NAME) + set(NVCC_FLAG ${CUDA_NVCC_FLAGS}) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--use_fast_math) + cuda_add_library(${TARGET_NAME} STATIC ${ARGN}) + set(CUDA_NVCC_FLAGS ${NVCC_FLAG}) +endmacro() diff --git a/demo/image_classification/.gitignore b/demo/image_classification/.gitignore new file mode 100644 index 00000000000000..76961dd1436f85 --- /dev/null +++ b/demo/image_classification/.gitignore @@ -0,0 +1,7 @@ +data/cifar-10-batches-py +data/cifar-out +cifar_vgg_model/* +plot.png +train.log +image_provider_copy_1.py +*pyc diff --git a/demo/image_classification/classify.py b/demo/image_classification/classify.py new file mode 120000 index 00000000000000..fefce7086ae7a6 --- /dev/null +++ b/demo/image_classification/classify.py @@ -0,0 +1 @@ +../model_zoo/resnet/classify.py \ No newline at end of file diff --git a/demo/image_classification/classify.sh b/demo/image_classification/classify.sh new file mode 100755 index 00000000000000..f797631346f5a3 --- /dev/null +++ 
b/demo/image_classification/classify.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +python classify.py \ + --job=predict \ + --conf=vgg_16_cifar.py \ + --model=./cifar_vgg_model/pass-00299 \ + --multi_crop \ + --data=./example/test.list diff --git a/demo/image_classification/data/download_cifar.sh b/demo/image_classification/data/download_cifar.sh new file mode 100644 index 00000000000000..ca9b0b5c905254 --- /dev/null +++ b/demo/image_classification/data/download_cifar.sh @@ -0,0 +1,20 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz +tar zxf cifar-10-python.tar.gz +rm cifar-10-python.tar.gz +rm -rf cifar-out/* +echo Converting CIFAR data to images..... 
+python process_cifar.py ./cifar-10-batches-py ./cifar-out diff --git a/demo/image_classification/data/process_cifar.py b/demo/image_classification/data/process_cifar.py new file mode 100644 index 00000000000000..b766118eb00737 --- /dev/null +++ b/demo/image_classification/data/process_cifar.py @@ -0,0 +1,77 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import sys +import os +import PIL.Image as Image + +""" + Usage: python process_cifar input_dir output_dir +""" + + +def mkdir_not_exist(path): + """ + Make dir if the path does not exist. + path: the path to be created. + """ + if not os.path.exists(path): + os.mkdir(path) + +def create_dir_structure(output_dir): + """ + Create the directory structure for the directory. + output_dir: the direcotry structure path. + """ + mkdir_not_exist(os.path.join(output_dir)) + mkdir_not_exist(os.path.join(output_dir, "train")) + mkdir_not_exist(os.path.join(output_dir, "test")) + +def convert_batch(batch_path, label_set, label_map, + output_dir, data_split): + """ + Convert CIFAR batch to the structure of Paddle format. + batch_path: the batch to be converted. + label_set: the set of labels. + output_dir: the output path. + data_split: whether it is training or testing data. 
+ """ + data = np.load(batch_path) + for data, label, filename in zip(data['data'], data['labels'], + data['filenames']): + data = data.reshape((3, 32, 32)) + data = np.transpose(data, (1, 2, 0)) + label = label_map[label] + output_dir_this = os.path.join(output_dir, data_split, str(label)) + output_filename = os.path.join(output_dir_this, filename) + if not label in label_set: + label_set[label] = True + mkdir_not_exist(output_dir_this) + Image.fromarray(data).save(output_filename) + + +if __name__ == '__main__': + input_dir = sys.argv[1] + output_dir = sys.argv[2] + num_batch = 5 + create_dir_structure(output_dir) + label_map = {0: "airplane", 1: "automobile", 2: "bird", 3: "cat", 4: "deer", + 5: "dog", 6: "frog", 7: "horse", 8: "ship", 9: "truck"} + labels = {} + for i in range(1, num_batch + 1): + convert_batch(os.path.join(input_dir, "data_batch_%d" % i), labels, + label_map, output_dir, "train") + convert_batch(os.path.join(input_dir, "test_batch"), {}, + label_map, output_dir, "test") \ No newline at end of file diff --git a/demo/image_classification/image_predictor.py b/demo/image_classification/image_predictor.py new file mode 100644 index 00000000000000..002cb412aa3563 --- /dev/null +++ b/demo/image_classification/image_predictor.py @@ -0,0 +1,27 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import numpy as np +from optparse import OptionParser + +from py_paddle import swig_paddle, util, DataProviderWrapperConverter +from paddle.trainer.PyDataProviderWrapper import DenseSlot +from paddle.trainer.config_parser import parse_config + + + +""" +Will merge predictor from Qingqing. +""" diff --git a/demo/image_classification/image_provider.py b/demo/image_classification/image_provider.py new file mode 100644 index 00000000000000..9e2f8b8949b39b --- /dev/null +++ b/demo/image_classification/image_provider.py @@ -0,0 +1,81 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import io +import random + +import paddle.utils.image_util as image_util +from paddle.trainer.PyDataProvider2 import * + + +# +# {'img_size': 32, +# 'settings': , +# 'color': True, +# 'mean_img_size': 32, +# 'meta': './data/cifar-out/batches/batches.meta', +# 'num_classes': 10, +# 'file_list': ('./data/cifar-out/batches/train_batch_000',), +# 'use_jpeg': True} +def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg, + is_train, **kwargs): + settings.mean_img_size = mean_img_size + settings.img_size = img_size + settings.num_classes = num_classes + settings.color = color + settings.is_train = is_train + + if settings.color: + settings.img_raw_size = settings.img_size * settings.img_size * 3 + else: + settings.img_raw_size = settings.img_size * settings.img_size + + settings.meta_path = meta + settings.use_jpeg = use_jpeg + + settings.img_mean = image_util.load_meta(settings.meta_path, + settings.mean_img_size, + settings.img_size, + settings.color) + + settings.logger.info('Image size: %s', settings.img_size) + settings.logger.info('Meta path: %s', settings.meta_path) + settings.input_types = [ + dense_vector(settings.img_raw_size), # image feature + integer_value(settings.num_classes)] # labels + + settings.logger.info('DataProvider Initialization finished') + + +@provider(init_hook=hook) +def processData(settings, file_name): + """ + The main function for loading data. + Load the batch, iterate all the images and labels in this batch. + file_name: the batch file name. 
+ """ + data = cPickle.load(io.open(file_name, 'rb')) + indexes = list(range(len(data['images']))) + if settings.is_train: + random.shuffle(indexes) + for i in indexes: + if settings.use_jpeg == 1: + img = image_util.decode_jpeg(data['images'][i]) + else: + img = data['images'][i] + img_feat = image_util.preprocess_img(img, settings.img_mean, + settings.img_size, settings.is_train, + settings.color) + label = data['labels'][i] + yield img_feat.tolist(), int(label) diff --git a/demo/image_classification/image_util.py b/demo/image_classification/image_util.py new file mode 100644 index 00000000000000..c545d16aafbc74 --- /dev/null +++ b/demo/image_classification/image_util.py @@ -0,0 +1,207 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from PIL import Image +from cStringIO import StringIO + +def resize_image(img, target_size): + """ + Resize an image so that the shorter edge has length target_size. + img: the input image to be resized. + target_size: the target resized image size. + """ + percent = (target_size/float(min(img.size[0], img.size[1]))) + resized_size = int(round(img.size[0] * percent)), int(round(img.size[1] * percent)) + img = img.resize(resized_size, Image.ANTIALIAS) + return img + +def flip(im): + """ + Return the flipped image. + Flip an image along the horizontal direction. 
+ im: input image, (H x W x K) ndarrays + """ + if len(im.shape) == 3: + return im[:, :, ::-1] + else: + return im[:, ::-1] + +def crop_img(im, inner_size, color=True, test=True): + """ + Return cropped image. + The size of the cropped image is inner_size * inner_size. + im: (K x H x W) ndarrays + inner_size: the cropped image size. + color: whether it is color image. + test: whether in test mode. + If False, does random cropping and flipping. + If True, crop the center of images. + """ + if color: + height, width = max(inner_size, im.shape[1]), max(inner_size, im.shape[2]) + padded_im = np.zeros((3, height, width)) + startY = (height - im.shape[1]) / 2 + startX = (width - im.shape[2]) / 2 + endY, endX = startY + im.shape[1], startX + im.shape[2] + padded_im[:, startY: endY, startX: endX] = im + else: + im = im.astype('float32') + height, width = max(inner_size, im.shape[0]), max(inner_size, im.shape[1]) + padded_im = np.zeros((height, width)) + startY = (height - im.shape[0]) / 2 + startX = (width - im.shape[1]) / 2 + endY, endX = startY + im.shape[0], startX + im.shape[1] + padded_im[startY: endY, startX: endX] = im + if test: + startY = (height - inner_size) / 2 + startX = (width - inner_size) / 2 + else: + startY = np.random.randint(0, height - inner_size + 1) + startX = np.random.randint(0, width - inner_size + 1) + endY, endX = startY + inner_size, startX + inner_size + if color: + pic = padded_im[:, startY: endY, startX: endX] + else: + pic = padded_im[startY: endY, startX: endX] + if (not test) and (np.random.randint(2) == 0): + pic = flip(pic) + return pic + +def decode_jpeg(jpeg_string): + np_array = np.array(Image.open(StringIO(jpeg_string))) + if len(np_array.shape) == 3: + np_array = np.transpose(np_array, (2, 0, 1)) + return np_array + +def preprocess_img(im, img_mean, crop_size, is_train, color=True): + """ + Does data augmentation for images. + If is_train is false, cropping the center region from the image. 
+ If is_train is true, randomly crop a region from the image, + and randomy does flipping. + im: (K x H x W) ndarrays + """ + im = im.astype('float32') + test = not is_train + pic = crop_img(im, crop_size, color, test) + pic -= img_mean + return pic.flatten() + +def load_meta(meta_path, mean_img_size, crop_size, color=True): + """ + Return the loaded meta file. + Load the meta image, which is the mean of the images in the dataset. + The mean image is subtracted from every input image so that the expected mean + of each input image is zero. + """ + mean = np.load(meta_path)['data_mean'] + border = (mean_img_size - crop_size) / 2 + if color: + assert(mean_img_size * mean_img_size * 3 == mean.shape[0]) + mean = mean.reshape(3, mean_img_size, mean_img_size) + mean = mean[:, border: border + crop_size, + border: border + crop_size].astype('float32') + else: + assert(mean_img_size * mean_img_size == mean.shape[0]) + mean = mean.reshape(mean_img_size, mean_img_size) + mean = mean[border: border + crop_size, + border: border + crop_size].astype('float32') + return mean + +def load_image(img_path, is_color=True): + """ + Load image and return. + img_path: image path. + is_color: is color image or not. + """ + img = Image.open(img_path) + img.load() + return img + +def oversample(img, crop_dims): + """ + image : iterable of (H x W x K) ndarrays + crop_dims: (height, width) tuple for the crops. + Returned data contains ten crops of input image, namely, + four corner patches and the center patch as well as their + horizontal reflections. + """ + # Dimensions and center. 
+ im_shape = np.array(img[0].shape) + crop_dims = np.array(crop_dims) + im_center = im_shape[:2] / 2.0 + + # Make crop coordinates + h_indices = (0, im_shape[0] - crop_dims[0]) + w_indices = (0, im_shape[1] - crop_dims[1]) + crops_ix = np.empty((5, 4), dtype=int) + curr = 0 + for i in h_indices: + for j in w_indices: + crops_ix[curr] = (i, j, i + crop_dims[0], j + crop_dims[1]) + curr += 1 + crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate([ + -crop_dims / 2.0, + crop_dims / 2.0 + ]) + crops_ix = np.tile(crops_ix, (2, 1)) + + # Extract crops + crops = np.empty((10 * len(img), crop_dims[0], crop_dims[1], + im_shape[-1]), dtype=np.float32) + ix = 0 + for im in img: + for crop in crops_ix: + crops[ix] = im[crop[0]:crop[2], crop[1]:crop[3], :] + ix += 1 + crops[ix-5:ix] = crops[ix-5:ix, :, ::-1, :] # flip for mirrors + return crops + +class ImageTransformer: + def __init__(self, transpose = None, + channel_swap = None, mean = None, is_color = True): + self.transpose = transpose + self.channel_swap = None + self.mean = None + self.is_color = is_color + + def set_transpose(self, order): + if self.is_color: + assert 3 == len(order) + self.transpose = order + + def set_channel_swap(self, order): + if self.is_color: + assert 3 == len(order) + self.channel_swap = order + + def set_mean(self, mean): + # mean value, may be one value per channel + if mean.ndim == 1: + mean = mean[:, np.newaxis, np.newaxis] + else: + # elementwise mean + if self.is_color: + assert len(mean.shape) == 3 + self.mean = mean + + def transformer(self, data): + if self.transpose is not None: + data = data.transpose(self.transpose) + if self.channel_swap is not None: + data = data[self.channel_swap, :, :] + if self.mean is not None: + data -= self.mean + return data diff --git a/demo/image_classification/preprocess.py b/demo/image_classification/preprocess.py new file mode 100755 index 00000000000000..0286a5d7e9dc8d --- /dev/null +++ b/demo/image_classification/preprocess.py @@ -0,0 +1,40 @@ +# 
Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.utils.preprocess_img import ImageClassificationDatasetCreater +from optparse import OptionParser + + +def option_parser(): + parser = OptionParser(usage="usage: python preprcoess.py "\ + "-i data_dir [options]") + parser.add_option("-i", "--input", action="store", + dest="input", help="Input data directory.") + parser.add_option("-s", "--size", action="store", + dest="size", help="Processed image size.") + parser.add_option("-c", "--color", action="store", + dest="color", help="whether to use color images.") + return parser.parse_args() + +if __name__ == '__main__': + options, args = option_parser() + data_dir = options.input + processed_image_size = int(options.size) + color = options.color == "1" + data_creator = ImageClassificationDatasetCreater(data_dir, + processed_image_size, + color) + data_creator.num_per_batch = 1000 + data_creator.overwrite = True + data_creator.create_batches() diff --git a/demo/image_classification/preprocess.sh b/demo/image_classification/preprocess.sh new file mode 100755 index 00000000000000..fe89c8f4bb9464 --- /dev/null +++ b/demo/image_classification/preprocess.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +export PYTHONPATH=$PYTHONPATH:../../ + +data_dir=./data/cifar-out + +python preprocess.py -i $data_dir -s 32 -c 1 diff --git a/demo/image_classification/train.sh b/demo/image_classification/train.sh new file mode 100755 index 00000000000000..ed9b5220fff6a4 --- /dev/null +++ b/demo/image_classification/train.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +config=vgg_16_cifar.py +output=./cifar_vgg_model +log=train.log + +paddle train \ +--config=$config \ +--dot_period=10 \ +--log_period=100 \ +--test_all_data_in_one_period=1 \ +--use_gpu=1 \ +--trainer_count=1 \ +--num_passes=200 \ +--save_dir=$output \ +2>&1 | tee $log + +python -m paddle.utils.plotcurve -i $log > plot.png diff --git a/demo/image_classification/upload_hadoop.sh b/demo/image_classification/upload_hadoop.sh new file mode 100755 index 00000000000000..34d3a8b7ce00f6 --- /dev/null +++ b/demo/image_classification/upload_hadoop.sh @@ -0,0 +1,18 @@ +# Copyright (c) 2016 Baidu, Inc. 
All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +hadoop fs -Dhadoop.job.ugi=paddle_demo,paddle_demo -put data/cifar-out/batches/train_batch_* /app/idl/idl-dl/paddle/demo/image_classification/train/ +hadoop fs -Dhadoop.job.ugi=paddle_demo,paddle_demo -put data/cifar-out/batches/test_batch_* /app/idl/idl-dl/paddle/demo/image_classification/test/ +hadoop fs -Dhadoop.job.ugi=paddle_demo,paddle_demo -put data/cifar-out/batches/batches.meta /app/idl/idl-dl/paddle/demo/image_classification/train_meta +hadoop fs -Dhadoop.job.ugi=paddle_demo,paddle_demo -put data/cifar-out/batches/batches.meta /app/idl/idl-dl/paddle/demo/image_classification/test_meta diff --git a/demo/image_classification/vgg_16_cifar.py b/demo/image_classification/vgg_16_cifar.py new file mode 100644 index 00000000000000..238608c3cbede1 --- /dev/null +++ b/demo/image_classification/vgg_16_cifar.py @@ -0,0 +1,56 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +is_predict = get_config_arg("is_predict", bool, False) + +####################Data Configuration ################## +if not is_predict: + data_dir='data/cifar-out/batches/' + meta_path=data_dir+'batches.meta' + + args = {'meta':meta_path,'mean_img_size': 32, + 'img_size': 32,'num_classes': 10, + 'use_jpeg': 1,'color': "color"} + + define_py_data_sources2(train_list=data_dir+"train.list", + test_list=data_dir+'test.list', + module='image_provider', + obj='processData', + args=args) + +######################Algorithm Configuration ############# +settings( + batch_size = 128, + learning_rate = 0.1 / 128.0, + learning_method = MomentumOptimizer(0.9), + regularization = L2Regularization(0.0005 * 128) +) + +#######################Network Configuration ############# +data_size=3*32*32 +label_size=10 +img = data_layer(name='image', + size=data_size) +# small_vgg is predefined in trainer_config_helpers.network +predict = small_vgg(input_image=img, + num_channels=3, + num_classes=label_size) + +if not is_predict: + lbl = data_layer(name="label", size=label_size) + outputs(classification_cost(input=predict, label=lbl)) +else: + outputs(predict) diff --git a/demo/model_zoo/embedding/.gitignore b/demo/model_zoo/embedding/.gitignore new file mode 100644 index 00000000000000..908f5a3fb2f7c3 --- /dev/null +++ b/demo/model_zoo/embedding/.gitignore @@ -0,0 +1,2 @@ +baidu.dict +model_*.emb diff --git a/demo/model_zoo/embedding/extract_para.py b/demo/model_zoo/embedding/extract_para.py new file mode 100755 index 00000000000000..17067792fc38d0 --- /dev/null +++ b/demo/model_zoo/embedding/extract_para.py @@ -0,0 +1,96 @@ +#!/bin/env python +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Example: + python extract_para.py --preModel PREMODEL --preDict PREDICT \ + --usrModel USRMODEL --usrDict USRDICT -d DIM + +Options: + -h, --help show this help message and exit + --preModel PREMODEL the name of pretrained embedding model + --preDict PREDICT the name of pretrained dictionary + --usrModel usrModel the name of output usr embedding model + --usrDict usrDict the name of user specified dictionary + -d DIM dimension of parameter +""" +from optparse import OptionParser +import struct + +def get_row_index(preDict, usrDict): + """ + Get the row positions for all words in user dictionary from pre-trained dictionary. 
+ return: a list of row positions + Example: preDict='a\nb\nc\n', usrDict='a\nc\n', then return [0,2] + """ + pos = [] + index = dict() + with open(preDict, "r") as f: + for line_index, line in enumerate(f): + word = line.strip().split()[0] + index[word] = line_index + with open(usrDict, "r") as f: + for line in f: + word = line.strip().split()[0] + pos.append(index[word]) + return pos + +def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict, paraDim): + """ + Extract desired parameters from a pretrained embedding model based on user dictionary + """ + if paraDim not in [32, 64, 128, 256]: + raise RuntimeError("We only support 32, 64, 128, 256 dimensions now") + + fi = open(preModel, "rb") + fo = open(usrModel, "wb") + + # write filehead + rowIndex = get_row_index(preDict, usrDict) + newHead = struct.pack("iil", 0, 4, len(rowIndex) * paraDim) + fo.write(newHead) + bytes = 4 * paraDim + for i in range(0, len(rowIndex)): + # find the absolute position of input file + fi.seek(rowIndex[i] * bytes + 16, 0) + fo.write(fi.read(bytes)) + + print "extract parameters finish, total", len(rowIndex), "lines" + fi.close() + +def main(): + """ + Main entry for running paraconvert.py + """ + usage = "usage: \n" \ + "python %prog --preModel PREMODEL --preDict PREDICT" \ + " --usrModel USRMODEL --usrDict USRDICT -d DIM" + parser = OptionParser(usage) + parser.add_option("--preModel", action="store", dest="preModel", + help="the name of pretrained embedding model") + parser.add_option("--preDict", action="store", dest="preDict", + help="the name of pretrained dictionary") + parser.add_option("--usrModel", action="store", dest="usrModel", + help="the name of output usr embedding model") + parser.add_option("--usrDict", action="store", dest="usrDict", + help="the name of user specified dictionary") + parser.add_option("-d", action="store", dest="dim", + help="dimension of parameter") + (options, args) = parser.parse_args() + 
extract_parameters_by_usrDict(options.preModel, options.preDict, + options.usrModel, options.usrDict, int(options.dim)) + +if __name__ == '__main__': + main() diff --git a/demo/model_zoo/embedding/paraconvert.py b/demo/model_zoo/embedding/paraconvert.py new file mode 100755 index 00000000000000..523412303617a3 --- /dev/null +++ b/demo/model_zoo/embedding/paraconvert.py @@ -0,0 +1,151 @@ +#!/bin/env python +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Example: + python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM + python paraconvert.py --t2b -i INPUT -o OUTPUT + +Options: + -h, --help show this help message and exit + --b2t convert parameter file of embedding model from binary to text + --t2b convert parameter file of embedding model from text to binary + -i INPUT input parameter file name + -o OUTPUT output parameter file name + -d DIM dimension of parameter +""" +from optparse import OptionParser +import struct + +def binary2text(input, output, paraDim): + """ + Convert a binary parameter file of embedding model to be a text file. 
+ input: the name of input binary parameter file, the format is: + 1) the first 16 bytes is filehead: + version(4 bytes): version of paddle, default = 0 + floatSize(4 bytes): sizeof(float) = 4 + paraCount(8 bytes): total number of parameter + 2) the next (paraCount * 4) bytes is parameters, each has 4 bytes + output: the name of output text parameter file, for example: + 0,4,32156096 + -0.7845433,1.1937413,-0.1704215,... + 0.0000909,0.0009465,-0.0008813,... + ... + the format is: + 1) the first line is filehead: + version=0, floatSize=4, paraCount=32156096 + 2) other lines print the paramters + a) each line prints paraDim paramters splitted by ',' + b) there is paraCount/paraDim lines (embedding words) + paraDim: dimension of parameters + """ + fi = open(input, "rb") + fo = open(output, "w") + """ + """ + version, floatSize, paraCount = struct.unpack("iil", fi.read(16)) + newHead = ','.join([str(version), str(floatSize), str(paraCount)]) + print >> fo, newHead + + bytes = 4 * int(paraDim) + format = "%df" % int(paraDim) + context = fi.read(bytes) + line = 0 + + while context: + numbers = struct.unpack(format, context) + lst = [] + for i in numbers: + lst.append('%8.7f' % i) + print >> fo, ','.join(lst) + context = fi.read(bytes) + line += 1 + fi.close() + fo.close() + print "binary2text finish, total", line, "lines" + +def get_para_count(input): + """ + Compute the total number of embedding parameters in input text file. + input: the name of input text file + """ + numRows = 1 + paraDim = 0 + with open(input) as f: + line = f.readline() + paraDim = len(line.split(",")) + for line in f: + numRows += 1 + return numRows * paraDim + +def text2binary(input, output, paddle_head=True): + """ + Convert a text parameter file of embedding model to be a binary file. + input: the name of input text parameter file, for example: + -0.7845433,1.1937413,-0.1704215,... + 0.0000909,0.0009465,-0.0008813,... + ... 
+ the format is: + 1) it doesn't have filehead + 2) each line stores the same dimension of parameters, + the separator is commas ',' + output: the name of output binary parameter file, the format is: + 1) the first 16 bytes is filehead: + version(4 bytes), floatSize(4 bytes), paraCount(8 bytes) + 2) the next (paraCount * 4) bytes is parameters, each has 4 bytes + """ + fi = open(input, "r") + fo = open(output, "wb") + + newHead = struct.pack("iil", 0, 4, get_para_count(input)) + fo.write(newHead) + + count = 0 + for line in fi: + line = line.strip().split(",") + for i in range(0, len(line)): + binary_data = struct.pack("f", float(line[i])) + fo.write(binary_data) + count += 1 + fi.close() + fo.close() + print "text2binary finish, total", count, "lines" + +def main(): + """ + Main entry for running paraconvert.py + """ + usage = "usage: \n" \ + "python %prog --b2t -i INPUT -o OUTPUT -d DIM \n" \ + "python %prog --t2b -i INPUT -o OUTPUT" + parser = OptionParser(usage) + parser.add_option("--b2t", action="store_true", + help="convert parameter file of embedding model from binary to text") + parser.add_option("--t2b", action="store_true", + help="convert parameter file of embedding model from text to binary") + parser.add_option("-i", action="store", dest="input", + help="input parameter file name") + parser.add_option("-o", action="store", dest="output", + help="output parameter file name") + parser.add_option("-d", action="store", dest="dim", + help="dimension of parameter") + (options, args) = parser.parse_args() + if options.b2t: + binary2text(options.input, options.output, options.dim) + if options.t2b: + text2binary(options.input, options.output) + +if __name__ == '__main__': + main() diff --git a/demo/model_zoo/embedding/pre_DictAndModel.sh b/demo/model_zoo/embedding/pre_DictAndModel.sh new file mode 100755 index 00000000000000..7821850fb25cc5 --- /dev/null +++ b/demo/model_zoo/embedding/pre_DictAndModel.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Copyright (c) 2016 
Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +set -x + +# download the dictionary and pretrained model +for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb +do + # following is the google drive address + # you can also directly download from https://pan.baidu.com/s/1o8q577s + wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/$file --no-check-certificate +done diff --git a/demo/model_zoo/resnet/.gitignore b/demo/model_zoo/resnet/.gitignore new file mode 100644 index 00000000000000..7a64209b62340a --- /dev/null +++ b/demo/model_zoo/resnet/.gitignore @@ -0,0 +1,5 @@ +fea_output/ +features/ +model.list +ResNet_50.dot +ResNet_50.png diff --git a/demo/model_zoo/resnet/classify.py b/demo/model_zoo/resnet/classify.py new file mode 100755 index 00000000000000..e818995fa31a92 --- /dev/null +++ b/demo/model_zoo/resnet/classify.py @@ -0,0 +1,274 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import cPickle +import logging +from PIL import Image +import numpy as np +from optparse import OptionParser + +import paddle.utils.image_util as image_util + +from py_paddle import swig_paddle, util +from py_paddle import DataProviderWrapperConverter +from paddle.trainer.PyDataProviderWrapper import DenseSlot +from paddle.trainer.config_parser import parse_config + +logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') +logging.getLogger().setLevel(logging.INFO) + +class ImageClassifier(): + def __init__(self, train_conf, model_dir=None, + resize_dim=256, crop_dim=224, + mean_file=None, + output_layer=None, + oversample=False, is_color=True): + """ + train_conf: network configure. + model_dir: string, directory of model. + resize_dim: int, resized image size. + crop_dim: int, crop size. + mean_file: string, image mean file. + oversample: bool, oversample means multiple crops, namely five + patches (the four corner patches and the center + patch) as well as their horizontal reflections, + ten crops in all. 
+ """ + self.train_conf = train_conf + self.model_dir = model_dir + if model_dir is None: + self.model_dir = os.path.dirname(train_conf) + + self.resize_dim = resize_dim + self.crop_dims = [crop_dim, crop_dim] + self.oversample = oversample + self.is_color = is_color + + self.output_layer = output_layer + if self.output_layer: + assert isinstance(self.output_layer, basestring) + self.output_layer = self.output_layer.split(",") + + self.transformer = image_util.ImageTransformer(is_color = is_color) + self.transformer.set_transpose((2,0,1)) + self.transformer.set_channel_swap((2,1,0)) + + self.mean_file = mean_file + if self.mean_file is not None: + mean = np.load(self.mean_file)['data_mean'] + mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1]) + self.transformer.set_mean(mean) # mean pixel + else: + # if you use three mean value, set like: + # this three mean value is calculated from ImageNet. + self.transformer.set_mean(np.array([103.939,116.779,123.68])) + + conf_args = "is_test=1,use_gpu=1,is_predict=1" + conf = parse_config(train_conf, conf_args) + swig_paddle.initPaddle("--use_gpu=1") + self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) + assert isinstance(self.network, swig_paddle.GradientMachine) + self.network.loadParameters(self.model_dir) + + data_size = 3 * self.crop_dims[0] * self.crop_dims[1] + slots = [DenseSlot(data_size)] + is_sequence = False + self.converter = util.DataProviderWrapperConverter(is_sequence, slots) + + def get_data(self, img_path): + """ + 1. load image from img_path. + 2. resize or oversampling. + 3. transformer data: transpose, channel swap, sub mean. + return K x H x W ndarray. + + img_path: image path. + """ + image = image_util.load_image(img_path, self.is_color) + # Another way to extract oversampled features is that + # cropping and averaging from large feature map which is + # calculated by large size of image. + # This way reduces the computation. 
+ if self.oversample: + # image_util.resize_image: short side is self.resize_dim + image = image_util.resize_image(image, self.resize_dim) + image = np.array(image) + input = np.zeros((1, image.shape[0], image.shape[1], 3), + dtype=np.float32) + input[0] = image.astype(np.float32) + input = image_util.oversample(input, self.crop_dims) + else: + image = image.resize(self.crop_dims, Image.ANTIALIAS) + input = np.zeros((1, self.crop_dims[0], self.crop_dims[1], 3), + dtype=np.float32) + input[0] = np.array(image).astype(np.float32) + + data_in = [] + for img in input: + img = self.transformer.transformer(img).flatten() + data_in.append([img.tolist()]) + # paddle input: [[[]],[[]],...], [[]] is one sample. + return data_in + + def forward(self, input_data): + """ + return output arguments which are the Outputs() in network configure. + + input_data: py_paddle input data. + call forward. + """ + in_arg = self.converter(input_data) + return self.network.forwardTest(in_arg) + + def forward(self, data, output_layer): + """ + return output arguments which are the Outputs() in network configure. + + input_data: py_paddle input data. + call forward. + """ + input = self.converter(data) + self.network.forwardTest(input) + output = self.network.getLayerOutputs(output_layer) + res = {} + if isinstance(output_layer, basestring): + output_layer = [output_layer] + for name in output_layer: + # For oversampling, average predictions across crops. + # If not, the shape of output[name]: (1, class_number), + # the mean is also applicable. + res[name] = output[name].mean(0) + + return res + + def predict(self, data_file): + """ + call forward and predicting. + + data_file: input image list. 
+ """ + image_files = open(data_file, 'rb').readlines() + results = {} + if self.output_layer is None: + self.output_layer = ["output"] + for line in image_files: + image = line.split()[0] + data = self.get_data(image) + prob = self.forward(data, self.output_layer) + lab = np.argsort(-prob[self.output_layer[0]]) + results[image] = lab[0] + logging.info("Label of %s is: %d", image, lab[0]) + return results + + def extract(self, data_file, output_dir, batch_size = 10000): + """ + extract and save features of output layers, which are + specified in Outputs() in network configure. + + data_file: file name of input data. + output_dir: saved directory of extracted features. + batch_size: sample number of one batch file. + """ + if not os.path.exists(output_dir): + os.mkdir(output_dir) + + sample_num = 0 + batch_num = 0 + image_feature = {} + image_files = open(data_file, 'rb').readlines() + for idx, line in enumerate(image_files): + image = line.split()[0] + data = self.get_data(image) + feature = self.forward(data, self.output_layer) + # save extracted features + file_name = image.split("/")[-1] + image_feature[file_name] = feature + sample_num += 1 + if sample_num == batch_size: + batch_name = os.path.join(output_dir, 'batch_%d' %(batch_num)) + self.save_file(image_feature, batch_name) + logging.info('Finish batch %d', batch_num) + batch_num += 1 + sample_num = 0 + image_feature = {} + if idx % 1000 == 0: + logging.info('%d/%d, %s', idx, len(image_files), file_name) + if sample_num > 0: + batch_name = os.path.join(output_dir, 'batch_%d' %(batch_num)) + self.save_file(image_feature, batch_name) + logging.info('Finish batch %d', batch_num) + logging.info('Done: make image feature batch') + + def save_file(self, data, file): + of = open(file, 'wb') + cPickle.dump(data, of, protocol=cPickle.HIGHEST_PROTOCOL) + +def option_parser(): + """ + Main entry for predicting + """ + usage = "%prog -c config -i data_list -w model_dir [options]" + parser = OptionParser(usage="usage: %s" 
% usage) + parser.add_option("-j", "--job", + action="store", dest="job_type", + help="job type: predict, extract\ + predict: predicting,\ + extract: extract features") + parser.add_option("-c", "--conf", + action="store", dest="train_conf", + help="network config") + parser.add_option("-i", "--data", + action="store", dest="data_file", + help="image list") + parser.add_option("-w", "--model", + action="store", dest="model_path", + default=None, help="model path") + parser.add_option("-o", "--output_dir", + action="store", dest="output_dir", + default="output", help="output path") + parser.add_option("-m", "--mean", action="store", + dest="mean", default=None, + help="mean file.") + parser.add_option("-p", "--multi_crop", action="store_true", + dest="multi_crop", default=False, + help="Wether to use multiple crops on image.") + parser.add_option("-l", "--output_layer", action="store", + dest="output_layer", default=None, + help="--job=extract, specify layers to extract "\ + "features, --job=predict, specify layer of " + "classification probability, output in resnet.py.") + return parser.parse_args() + +def main(): + """ + 1. parse input arguments. + 2. predicting or extract features according job type. 
+ """ + options, args = option_parser() + obj = ImageClassifier(options.train_conf, + options.model_path, + mean_file=options.mean, + output_layer=options.output_layer, + oversample=options.multi_crop) + if options.job_type == "predict": + obj.predict(options.data_file) + + elif options.job_type == "extract": + obj.extract(options.data_file, + options.output_dir) + +if __name__ == '__main__': + main() diff --git a/demo/model_zoo/resnet/example/.gitignore b/demo/model_zoo/resnet/example/.gitignore new file mode 100644 index 00000000000000..4a2b5962a6800f --- /dev/null +++ b/demo/model_zoo/resnet/example/.gitignore @@ -0,0 +1 @@ +*image_list_provider_copy_1.py diff --git a/demo/model_zoo/resnet/example/__init__.py b/demo/model_zoo/resnet/example/__init__.py new file mode 100644 index 00000000000000..7f9e87eee60376 --- /dev/null +++ b/demo/model_zoo/resnet/example/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ diff --git a/demo/model_zoo/resnet/example/cat.jpg b/demo/model_zoo/resnet/example/cat.jpg new file mode 100644 index 00000000000000..47b01db90eddc4 Binary files /dev/null and b/demo/model_zoo/resnet/example/cat.jpg differ diff --git a/demo/model_zoo/resnet/example/dog.jpg b/demo/model_zoo/resnet/example/dog.jpg new file mode 100644 index 00000000000000..b9cc33cf069da5 Binary files /dev/null and b/demo/model_zoo/resnet/example/dog.jpg differ diff --git a/demo/model_zoo/resnet/example/image_list_provider.py b/demo/model_zoo/resnet/example/image_list_provider.py new file mode 100644 index 00000000000000..ee457e1fffc7ed --- /dev/null +++ b/demo/model_zoo/resnet/example/image_list_provider.py @@ -0,0 +1,105 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.utils.image_util import * +from paddle.trainer.PyDataProvider2 import * + + +def hook(settings, image_size, crop_size, color, file_list, + is_train, **kwargs): + """ + Description: Init with a list of data file + file_list is the name list of input files. + kwargs["load_data_args"] is the value of 'load_data_args' + which can be set in config. + Each args is separated by a column. + image_size: the crop image size. + mean_meta: the path of the meta file to store the mean image. + mean_value: can be mean value, not a file. + can not set mean_meta and mean_value at the same time. + color: 'color' means a color image. Otherwise, it means a gray image. 
+ is_train: whether the data provider is used for training. + Data argumentation might be different for training and testing. + """ + settings.img_size = image_size + settings.crop_size = crop_size + settings.mean_img_size = settings.crop_size + settings.color = color # default is color + settings.is_train = is_train + + settings.is_swap_channel = kwargs.get('swap_channel', None) + if settings.is_swap_channel is not None: + settings.swap_channel = settings.is_swap_channel + settings.is_swap_channel = True + + if settings.color: + settings.img_input_size = settings.crop_size * settings.crop_size * 3 + else: + settings.img_input_size = settings.crop_size * settings.crop_size + + settings.file_list = file_list + settings.mean_meta = kwargs.get('mean_meta', None) + settings.mean_value = kwargs.get('mean_value', None) + # can not specify both mean_meta and mean_value. + assert not (settings.mean_meta and settings.mean_value) + if not settings.mean_meta: + settings.mean_value = kwargs.get('mean_value') + sz = settings.crop_size * settings.crop_size + settings.img_mean = np.zeros(sz * 3, dtype=np.single) + for idx, value in enumerate(settings.mean_value): + settings.img_mean[idx * sz: (idx + 1) * sz] = value + settings.img_mean = settings.img_mean.reshape(3, settings.crop_size, + settings.crop_size) + + else: + settings.img_mean = load_meta(settings.mean_meta, + settings.mean_img_size, + settings.crop_size, settings.color) + + settings.input_types = [ + dense_vector(settings.img_input_size), # image feature + integer_value(1)] # labels + + settings.logger.info('Image short side: %s', settings.img_size) + settings.logger.info('Crop size: %s', settings.crop_size) + settings.logger.info('Meta path: %s', settings.mean_meta) + if settings.is_swap_channel: + settings.logger.info('swap channel: %s', settings.swap_channel) + settings.logger.info('DataProvider Initialization finished') + + +@provider(init_hook=hook, should_shuffle=False) +def processData(settings, file_list): + 
""" + The main function for loading data. + Load the batch, iterate all the images and labels in this batch. + file_name: the batch file name. + """ + img_path, lab = file_list.strip().split(' ') + img = Image.open(img_path) + img.load() + img = img.resize((settings.img_size, settings.img_size), Image.ANTIALIAS) + img = np.array(img).astype(np.float32) + if len(img.shape) == 3: + img = np.swapaxes(img, 1, 2) + img = np.swapaxes(img, 1, 0) + # swap channel + if settings.is_swap_channel: + img = img[settings.swap_channel, :, :] + img_feat = preprocess_img(img, + settings.img_mean, + settings.crop_size, + settings.is_train, + settings.color) + yield img_feat.tolist(), int(lab.strip()) diff --git a/demo/model_zoo/resnet/example/test.list b/demo/model_zoo/resnet/example/test.list new file mode 100644 index 00000000000000..30bbf630b640a2 --- /dev/null +++ b/demo/model_zoo/resnet/example/test.list @@ -0,0 +1,2 @@ +example/dog.jpg 0 +example/cat.jpg 0 diff --git a/demo/model_zoo/resnet/extract_fea_c++.sh b/demo/model_zoo/resnet/extract_fea_c++.sh new file mode 100755 index 00000000000000..c7f9aea9a57df5 --- /dev/null +++ b/demo/model_zoo/resnet/extract_fea_c++.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -e + +#set names of layer which you want to extract feature +#in Outputs() of resnet.py +#like: Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn") +layer_num=50 +configure=./resnet.py +model_path=./model/resnet_$layer_num +fea_dir=fea_output +#Output is text file. +#Each line is one sample's features. +#If you set N layer names in Outputs() +#each line contains N features sperated by ";". + +# create model list file. +model_list=./model.list +touch $model_list | echo $model_path > $model_list + +paddle train \ + --local=true \ + --job=test \ + --config=$configure \ + --model_list=$model_list \ + --use_gpu=1 \ + --predict_output_dir=$fea_dir \ + --config_args=is_test=1,layer_num=$layer_num diff --git a/demo/model_zoo/resnet/extract_fea_py.sh b/demo/model_zoo/resnet/extract_fea_py.sh new file mode 100755 index 00000000000000..b0ec748bb8f0f8 --- /dev/null +++ b/demo/model_zoo/resnet/extract_fea_py.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -e + +python classify.py \ + --job=extract \ + --conf=resnet.py \ + --mean=model/mean_meta_224/mean.meta \ + --model=model/resnet_50 \ + --data=./example/test.list \ + --output_layer="res5_3_branch2c_conv,res5_3_branch2c_bn" \ + --output_dir=features diff --git a/demo/model_zoo/resnet/get_model.sh b/demo/model_zoo/resnet/get_model.sh new file mode 100755 index 00000000000000..89312d43edf8e4 --- /dev/null +++ b/demo/model_zoo/resnet/get_model.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +cd $DIR + +mkdir model +cd model + +echo "Downloading ResNet models..." + +for file in resnet_50.tar.gz resnet_101.tar.gz resnet_152.tar.gz mean_meta_224.tar.gz +do + # following is the google drive address + # you can also directly download from https://pan.baidu.com/s/1o8q577s + wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/imagenet/$file --no-check-certificate + tar -xvf $file + rm $file +done + +echo "Done." diff --git a/demo/model_zoo/resnet/load_feature.py b/demo/model_zoo/resnet/load_feature.py new file mode 100644 index 00000000000000..ee4930b7a17f7f --- /dev/null +++ b/demo/model_zoo/resnet/load_feature.py @@ -0,0 +1,59 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import cPickle +import logging + +logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') +logging.getLogger().setLevel(logging.INFO) + +def load_feature_c(file): + """ + Load feature extracted by C++ interface. + Return a list. + file: feature file. + """ + features = [] + f = open(file, 'r') + for line in f: + sample = [] + for slot in line.strip().split(";"): + fea = [float(val) for val in slot.strip().split()] + if fea: + sample.append(fea) + features.append(sample) + f.close() + return features + +def load_feature_py(feature_dir): + """ + Load feature extracted by python interface. + Return a dictionary. + feature_dir: directory of feature file. + """ + file_list = os.listdir(feature_dir) + file_list = [os.path.join(feature_dir, f) for f in file_list] + features = {} + for file_name in file_list: + with open(file_name, 'rb') as f: + feature = cPickle.load(f) + features.update(feature) + logging.info('Load feature file %s', file_name) + return features + +if __name__ == '__main__': + print load_feature_py(sys.argv[1]) + #print load_feature_c(sys.argv[1]) diff --git a/demo/model_zoo/resnet/net_diagram.sh b/demo/model_zoo/resnet/net_diagram.sh new file mode 100755 index 00000000000000..ec72432f0ad026 --- /dev/null +++ b/demo/model_zoo/resnet/net_diagram.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +:' +Visual deep residual network +1. Using make_model_diagram.py to generate dot file. +2. Using graphviz to convert dot file. + +Usage: +./net_diagram.sh +' + +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +cd $DIR + +img_type=png +img_fileprefix=ResNet_50 +conf_filename=resnet.py +dot_filename=ResNet_50.dot +config_str="layer_num=50,data_provider=0" + +python -m paddle.utils.make_model_diagram $conf_filename $dot_filename $config_str + +# If you have installed graphviz, running like this: +# dot -Tpng -o ResNet.png ResNet.dot diff --git a/demo/model_zoo/resnet/predict.sh b/demo/model_zoo/resnet/predict.sh new file mode 100755 index 00000000000000..0375cd2e08c85d --- /dev/null +++ b/demo/model_zoo/resnet/predict.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -e + +python classify.py \ + --job=predict \ + --conf=resnet.py\ + --model=model/resnet_50 \ + --multi_crop \ + --data=./example/test.list diff --git a/demo/model_zoo/resnet/resnet.py b/demo/model_zoo/resnet/resnet.py new file mode 100644 index 00000000000000..483e308ac804e1 --- /dev/null +++ b/demo/model_zoo/resnet/resnet.py @@ -0,0 +1,260 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +""" +paper: https://arxiv.org/abs/1512.03385 +""" +is_test = get_config_arg("is_test", bool, False) +is_predict = get_config_arg("is_predict", bool, False) +data_provider = get_config_arg("data_provider", bool, True) +layer_num = get_config_arg("layer_num", int, 50) + +if not is_predict and data_provider: + train_list = 'train.list' if not is_test else None + # mean.meta is mean file of ImageNet dataset. + # mean.meta size : 3 x 224 x 224. 
+ # If you use three mean value, set like: + # "mean_value:103.939,116.779,123.68;" + args={ + 'mean_meta': "model/mean_meta_224/mean.meta", + 'image_size': 224, 'crop_size': 224, + 'color': True,'swap_channel:': [2, 1, 0]} + define_py_data_sources2(train_list, + 'example/test.list', + module="example.image_list_provider", + obj="processData", + args=args) + +batch_size = 1 +learning_rate = 0.1 / batch_size +momentum = 0.9 +weight_decay = 0.0001 * batch_size +default_momentum(momentum) +default_decay_rate(weight_decay) + +Settings( + algorithm='sgd', + batch_size=batch_size, + learning_rate=learning_rate, + + # set the appropriate parameters according your schedule + learning_method='momentum', + learning_rate_decay_a=0.5, + learning_rate_decay_b=1200000 * 10, + learning_rate_schedule="discexp", +) + + +def conv_bn_layer(name, input, filter_size, num_filters, + stride, padding, channels=None, + active_type=ReluActivation()): + """ + A wrapper for conv layer with batch normalization layers. + Note: + conv layer has no activation. + """ + + tmp = img_conv_layer(name=name + "_conv", + input=input, + filter_size=filter_size, + num_channels=channels, + num_filters=num_filters, + stride=stride, + padding=padding, + act=LinearActivation(), + bias_attr=False) + return batch_norm_layer(name=name + "_bn", + input=tmp, + act=active_type, + use_global_stats=is_test) + + +def bottleneck_block(name, input, num_filters1, num_filters2): + """ + A wrapper for bottlenect building block in ResNet. + Last conv_bn_layer has no activation. + Addto layer has activation of relu. 
+ """ + last_name = conv_bn_layer(name=name + '_branch2a', + input=input, + filter_size=1, + num_filters=num_filters1, + stride=1, + padding=0) + last_name = conv_bn_layer(name=name + '_branch2b', + input=last_name, + filter_size=3, + num_filters=num_filters1, + stride=1, + padding=1) + last_name = conv_bn_layer(name=name + '_branch2c', + input=last_name, + filter_size=1, + num_filters=num_filters2, + stride=1, + padding=0, + active_type=LinearActivation()) + + return addto_layer(name=name + "_addto", + input=[input, last_name], + act=ReluActivation()) + + +def mid_projection(name, input, num_filters1, num_filters2, stride=2): + """ + A wrapper for middile projection in ResNet. + projection shortcuts are used for increasing dimensions, + and other shortcuts are identity + branch1: projection shortcuts are used for increasing + dimensions, has no activation. + branch2x: bottleneck building block, shortcuts are identity. + """ + # stride = 2 + branch1 = conv_bn_layer(name=name + '_branch1', + input=input, + filter_size=1, + num_filters=num_filters2, + stride=stride, + padding=0, + active_type=LinearActivation()) + + last_name = conv_bn_layer(name=name + '_branch2a', + input=input, + filter_size=1, + num_filters=num_filters1, + stride=stride, + padding=0) + last_name = conv_bn_layer(name=name + '_branch2b', + input=last_name, + filter_size=3, + num_filters=num_filters1, + stride=1, + padding=1) + + last_name = conv_bn_layer(name=name + '_branch2c', + input=last_name, + filter_size=1, + num_filters=num_filters2, + stride=1, + padding=0, + active_type=LinearActivation()) + + return addto_layer(name=name + "_addto", + input=[branch1, last_name], + act=ReluActivation()) + + +def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3): + """ + A wrapper for 50,101,152 layers of ResNet. 
+ res2_num: number of blocks stacked in conv2_x + res3_num: number of blocks stacked in conv3_x + res4_num: number of blocks stacked in conv4_x + res5_num: number of blocks stacked in conv5_x + """ + # For ImageNet + # conv1: 112x112 + img = data_layer(name='input', size=224 * 224 * 3) + tmp = conv_bn_layer("conv1", img, + filter_size=7, + channels=3, + num_filters=64, + stride=2, + padding=3) + tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2) + + # conv2_x: 56x56 + tmp = mid_projection(name="res2_1", + input=tmp, + num_filters1=64, + num_filters2=256, + stride=1) + for i in xrange(2, res2_num + 1, 1): + tmp = bottleneck_block(name="res2_" + str(i), + input=tmp, + num_filters1=64, + num_filters2=256) + + # conv3_x: 28x28 + tmp = mid_projection(name="res3_1", + input=tmp, + num_filters1=128, + num_filters2=512) + for i in xrange(2, res3_num + 1, 1): + tmp = bottleneck_block(name="res3_" + str(i), + input=tmp, num_filters1=128, + num_filters2=512) + + # conv4_x: 14x14 + tmp = mid_projection(name="res4_1", input=tmp, + num_filters1=256, num_filters2=1024) + for i in xrange(2, res4_num + 1, 1): + tmp = bottleneck_block(name="res4_" + str(i), + input=tmp, + num_filters1=256, + num_filters2=1024) + + # conv5_x: 7x7 + tmp = mid_projection(name="res5_1", input=tmp, + num_filters1=512, num_filters2=2048) + for i in xrange(2, res5_num + 1, 1): + tmp = bottleneck_block(name="res5_" + str(i), + input=tmp, num_filters1=512, + num_filters2=2048) + + tmp = img_pool_layer(name='avgpool', + input=tmp, + pool_size=7, + stride=1, + pool_type=AvgPooling()) + + output = fc_layer(name='output', + input=tmp, + size=1000, + act=SoftmaxActivation()) + + if not is_predict: + classification_cost(input=output, label=data_layer(name='label', + size=1)) + + +def res_net_50(): + deep_res_net(3, 4, 6, 3) + + +def res_net_101(): + deep_res_net(3, 4, 23, 3) + + +def res_net_152(): + deep_res_net(3, 8, 36, 3) + + +if not is_predict: + Inputs("input", "label") +else: + 
Inputs("input") +# Outputs("cost-softmax" if not is_predict else "output") +Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn") + +if layer_num == 50: + res_net_50() +elif layer_num == 101: + res_net_101() +elif layer_num == 152: + res_net_152() +else: + print("Wrong layer number.") diff --git a/demo/quick_start/.gitignore b/demo/quick_start/.gitignore new file mode 100644 index 00000000000000..d6bc73105b1abf --- /dev/null +++ b/demo/quick_start/.gitignore @@ -0,0 +1,13 @@ +*.pyc +data/dict.txt +data/dict_all.txt +data/labels.list +data/mosesdecoder-master/ +data/reviews_Electronics_5.json.gz +data/test.list +data/test.txt +data/train.list +data/train.txt +dataprovider_copy_1.py +train.log +output diff --git a/demo/quick_start/data/get_data.sh b/demo/quick_start/data/get_data.sh new file mode 100755 index 00000000000000..f355d63225b28a --- /dev/null +++ b/demo/quick_start/data/get_data.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +cd $DIR + +echo "Downloading Amazon Electronics reviews data..." +# http://jmcauley.ucsd.edu/data/amazon/ +wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz + +echo "Downloading mosesdecoder..." +#https://github.com/moses-smt/mosesdecoder +wget https://github.com/moses-smt/mosesdecoder/archive/master.zip + +unzip master.zip +rm master.zip +echo "Done." 
diff --git a/demo/quick_start/data/pred.list b/demo/quick_start/data/pred.list new file mode 100644 index 00000000000000..d88b2b63851101 --- /dev/null +++ b/demo/quick_start/data/pred.list @@ -0,0 +1 @@ +./data/pred.txt diff --git a/demo/quick_start/data/pred.txt b/demo/quick_start/data/pred.txt new file mode 100644 index 00000000000000..6ed5f738ddaff6 --- /dev/null +++ b/demo/quick_start/data/pred.txt @@ -0,0 +1,2 @@ +the device is cute , but that 's just about all that 's good. the specs are what you 'd expect : it 's a wifi mic , with some noise filter options. the app has the option to upload your baby 's name and photo , which is a cutesy touch. but the app is otherwise unstable and useless unless you upgrade for $ 60 / year.set up involves downloading the app , turning on the mic , switching your phone to the wifi network of the mic , telling the app your wifi settings , switching your wifi back to your home router. the app is then directly connected to your mic.the app is adware ! the main screen says " cry notifications on / off : upgrade to evoz premium and receive a text message of email when your baby is crying " .but the adware points out an important limitation , this monitor is only intended to be used from your home network. if you want to access it remotely , get a webcam. this app would make a lot more sense of the premium features were included with the hardware . +don 't be fooled by my one star rating. if there was a zero , i would have selected it. this product was a waste of my money.it has never worked like the company said it supposed to. i only have one device , an iphone 4gs. after charging the the iphone mid way , the i.sound portable power max 16,000 mah is completely drained. the led light no longer lit up. when plugging the isound portable power max into a wall outlet to charge , it would charge for about 20-30 minutes and then all four battery led indicator lit up showing a full charge. 
i would leave it on to charge for the full 8 hours or more but each time with the same result upon using. don 't buy this thing. put your money to good use elsewhere . diff --git a/demo/quick_start/dataprovider_bow.py b/demo/quick_start/dataprovider_bow.py new file mode 100644 index 00000000000000..bbd3ecabaadbf5 --- /dev/null +++ b/demo/quick_start/dataprovider_bow.py @@ -0,0 +1,84 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer.PyDataProvider2 import * + +# id of the word not in dictionary +UNK_IDX = 0 + +# initializer is called by the framework during initialization. +# It allows the user to describe the data types and setup the +# necessary data structure for later use. +# `settings` is an object. initializer need to properly fill settings.input_types. +# initializer can also store other data structures needed to be used at process(). +# In this example, dictionary is stored in settings. +# `dictionay` and `kwargs` are arguments passed from trainer_config.lr.py +def initializer(settings, dictionary, **kwargs): + # Put the word dictionary into settings + settings.word_dict = dictionary + + # setting.input_types specifies what the data types the data provider + # generates. + settings.input_types = [ + # The first input is a sparse_binary_vector, + # which means each dimension of the vector is either 0 or 1. It is the + # bag-of-words (BOW) representation of the texts. 
+ sparse_binary_vector(len(dictionary)), + # The second input is an integer. It represents the category id of the + # sample. 2 means there are two labels in the dataset. + # (1 for positive and 0 for negative) + integer_value(2)] + +# Delaring a data provider. It has an initializer 'data_initialzer'. +# It will cache the generated data of the first pass in memory, so that +# during later pass, no on-the-fly data generation will be needed. +# `setting` is the same object used by initializer() +# `file_name` is the name of a file listed train_list or test_list file given +# to define_py_data_sources2(). See trainer_config.lr.py. +@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM) +def process(settings, file_name): + # Open the input data file. + with open(file_name, 'r') as f: + # Read each line. + for line in f: + # Each line contains the label and text of the comment, separated by \t. + label, comment = line.strip().split('\t') + + # Split the words into a list. + words = comment.split() + + # convert the words into a list of ids by looking them up in word_dict. + word_vector = [settings.word_dict.get(w, UNK_IDX) for w in words] + + # Return the features for the current comment. The first is a list + # of ids representing a 0-1 binary sparse vector of the text, + # the second is the integer id of the label. + yield word_vector, int(label) + + +def predict_initializer(settings, dictionary, **kwargs): + settings.word_dict = dictionary + settings.input_types = [ + sparse_binary_vector(len(dictionary)) + ] + +# Declaring a data provider for prediction. The difference with process +# is that label is not generated. 
+@provider(init_hook=predict_initializer) +def process_predict(settings, file_name): + with open(file_name, 'r') as f: + for line in f: + comment = line.strip() + word_vector = [settings.word_dict.get(w, UNK_IDX) for w in comment] + yield word_vector diff --git a/demo/quick_start/dataprovider_emb.py b/demo/quick_start/dataprovider_emb.py new file mode 100755 index 00000000000000..e9b17603818b3a --- /dev/null +++ b/demo/quick_start/dataprovider_emb.py @@ -0,0 +1,52 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer.PyDataProvider2 import * + +UNK_IDX = 0 + +def initializer(settings, dictionary, **kwargs): + settings.word_dict = dictionary + settings.input_types = [ + # Define the type of the first input as sequence of integer. 
+ # The value of the integers range from 0 to len(dictrionary)-1 + integer_value_sequence(len(dictionary)), + # Define the second input for label id + integer_value(2)] + + +@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM) +def process(settings, file_name): + with open(file_name, 'r') as f: + for line in f: + label, comment = line.strip().split('\t') + words = comment.split() + word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words] + yield word_slot, int(label) + + +def predict_initializer(settings, dictionary, **kwargs): + settings.word_dict = dictionary + settings.input_types = [ + integer_value(len(dictionary), seq_type=SequenceType.SEQUENCE) + ] + + +@provider(init_hook=predict_initializer) +def process_predict(settings, file_name): + with open(file_name, 'r') as f: + for line in f: + comment = line.strip() + word_slot = [settings.word_dict.get(w, UNK_IDX) for w in comment] + yield word_slot diff --git a/demo/quick_start/predict.sh b/demo/quick_start/predict.sh new file mode 100755 index 00000000000000..f764e202446a4e --- /dev/null +++ b/demo/quick_start/predict.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -e + +#cfg=trainer_config.lr.py +#cfg=trainer_config.emb.py +#cfg=trainer_config.cnn.py +cfg=trainer_config.lstm.py +model="output/pass-00003" +paddle train \ + --config=$cfg \ + --use_gpu=false \ + --job=test \ + --init_model_path=$model \ + --config_args=is_predict=1 \ + --predict_output_dir=. \ + +mv rank-00000 result.txt diff --git a/demo/quick_start/preprocess.py b/demo/quick_start/preprocess.py new file mode 100755 index 00000000000000..0ef7e65c749e75 --- /dev/null +++ b/demo/quick_start/preprocess.py @@ -0,0 +1,186 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +''' +1. remove HTML before tokensizing +2. pos sample : rating score 5; neg sample: rating score 1-2. +3. size of pos : neg = 1:1. +4. size of testing set = min(25k, len(all_data) * 0.1), others is traning set. +5. distinct train set and test set. + +Usage: + python preprocess.py -i data_file [random seed] +''' + +import sys,os +import re +import operator +import gzip,math +import random +import numpy as np +from bs4 import BeautifulSoup +from subprocess import Popen, PIPE +from optparse import OptionParser + +def parse(path): + """ + Open .gz file. + """ + g = gzip.open(path, 'r') + for l in g: + yield eval(l) + +def clean(review): + """ + Clean input review: remove HTML, convert words to lower cases. 
+ """ + # Remove HTML + review_text = BeautifulSoup(review, "html.parser").get_text() + + # Convert words to lower case + review_text = review_text.lower() + return review_text + +def tokenize(sentences): + """ + Use tokenizer.perl to tokenize input sentences. + tokenizer.perl is tool of Moses. + sentences : a list of input sentences. + return: a list of processed text. + """ + dir = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl' + tokenizer_cmd = [dir, '-l', 'en', '-q', '-'] + assert isinstance(sentences, list) + text = "\n".join(sentences) + tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE) + tok_text, _ = tokenizer.communicate(text) + toks = tok_text.split('\n')[:-1] + return toks + +def create_dict(data, data_dir): + """ + Create dictionary based on data, and saved in data_dir/dict.txt. + The first line is unk \t -1. + data: list, input data. + data_dir: path to save dict. + """ + word_count = {} + for seq in data: + try: + for w in seq.lower().split(): + if w not in word_count: + word_count[w] = 1 + else: + word_count[w] += 1 + except: + sys.stderr.write(seq+"\tERROR\n") + f = open(os.path.join(data_dir, 'dict.txt'), 'w') + f.write('%s\t%s\n' % ('unk', '-1')) + for k, v in sorted(word_count.items(), key=operator.itemgetter(1),\ + reverse=True): + f.write('%s\t%s\n' % (k, v)) + f.close() + +def save_data(data, data_dir, prefix = ""): + file_name = os.path.join(data_dir, "%s.txt" % (prefix)) + file(file_name,'w').write('\n'.join(data)+'\n') + file(os.path.join(data_dir, prefix+'.list'),'w').write('%s\n' % file_name) + +def split_data(raw_txt): + """ + Extract positive and negative sample. 
+ """ + pos = [] + neg = [] + count = 0 + dup_cnt = 0 + sys.stderr.write("extract raw data") + for l in raw_txt: + rating = l["overall"] + text = clean(l["reviewText"]) + if rating == 5.0 and text: + pos.append(text) + if rating < 3.0 and text: + neg.append(text) + count += 1 + if count % 20000==0: + sys.stderr.write(".") + sys.stderr.write("\n") + return pos, neg + +def preprocess(pos_in, neg_in, data_dir, rand_seed): + # tokenize + sys.stderr.write("tokenize...\n") + tmppos = tokenize(pos_in) + tmpneg = tokenize(neg_in) + cnt = len(tmppos) + len(tmpneg) + + # unique smaples + tmppos = list(set(tmppos)) + tmpneg = list(set(tmpneg)) + dup_cnt = cnt - len(tmppos) - len(tmpneg) + sys.stderr.write("\ntotal size of data set: %d, duplicate data: %d\n" % (cnt, dup_cnt)) + + # keep same size of positive and negative sample + min_len = min(len(tmppos), len(tmpneg)) + tmppos = tmppos[0:min_len] + tmpneg = tmpneg[0:min_len] + + # creat dictionary + sys.stderr.write("create dict with train and test data...\n") + all_data = tmppos + tmpneg + create_dict(all_data, data_dir) + + # split into train set and test set + sys.stderr.write("split data...\n") + pos = ["1\t"+i for i in tmppos] + neg = ["0\t"+i for i in tmpneg] + random.seed(rand_seed) + random.shuffle(pos) + random.shuffle(neg) + + # split into test set and train set + test_len = min(12500, int(min_len * 0.1)) + test = pos[0:test_len] + neg[0:test_len] + train = pos[test_len:] + neg[test_len:] + + # save data + sys.stderr.write("save data...\n") + save_data(train, data_dir, prefix = 'train') + save_data(test, data_dir, prefix = 'test') + file(os.path.join(data_dir,'labels.list'),'w').write('neg\t0\npos\t1\n') + +def option_parser(): + parser = OptionParser(usage="usage: python preprcoess.py "\ + "-i data_path [options]") + parser.add_option("-i", "--data", action="store", + dest="input", help="Input data path.") + parser.add_option("-s", "--seed", action="store", + dest="seed", default=1024, + help="Set random seed.") + 
return parser.parse_args() + +def main(): + reload(sys) + sys.setdefaultencoding('utf-8') + options, args = option_parser() + data=options.input + seed=options.seed + data_dir = os.path.dirname(data) + pos, neg = split_data(parse(data)) + preprocess(pos, neg, data_dir, seed) + sys.stderr.write("Done.\n") + +if __name__ == '__main__': + main() diff --git a/demo/quick_start/preprocess.sh b/demo/quick_start/preprocess.sh new file mode 100755 index 00000000000000..f4d8e647a22525 --- /dev/null +++ b/demo/quick_start/preprocess.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +python preprocess.py -i data/reviews_Electronics_5.json.gz + +# use 30k dict +mv data/dict.txt data/dict_all.txt +cat data/dict_all.txt | head -n 30001 > data/dict.txt diff --git a/demo/quick_start/requirements.txt b/demo/quick_start/requirements.txt new file mode 100644 index 00000000000000..c1f5f713cdafc4 --- /dev/null +++ b/demo/quick_start/requirements.txt @@ -0,0 +1 @@ +beautifulsoup4 diff --git a/demo/quick_start/train.sh b/demo/quick_start/train.sh new file mode 100755 index 00000000000000..1f0a137c8bd594 --- /dev/null +++ b/demo/quick_start/train.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +cfg=trainer_config.lr.py +#cfg=trainer_config.emb.py +#cfg=trainer_config.cnn.py +#cfg=trainer_config.lstm.py +paddle train \ + --config=$cfg \ + --save_dir=./output \ + --trainer_count=4 \ + --log_period=20 \ + --num_passes=15 \ + --use_gpu=false \ + --show_parameter_stats_period=100 \ + --test_all_data_in_one_period=1 \ + 2>&1 | tee 'train.log' diff --git a/demo/quick_start/trainer_config.cnn.py b/demo/quick_start/trainer_config.cnn.py new file mode 100644 index 00000000000000..253ec0aee26cf4 --- /dev/null +++ b/demo/quick_start/trainer_config.cnn.py @@ -0,0 +1,55 @@ +# edit-mode: -*- python -*- + +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +dict_file = "./data/dict.txt" +word_dict = dict() +with open(dict_file, 'r') as f: + for i, line in enumerate(f): + w = line.strip().split()[0] + word_dict[w] = i + +is_predict = get_config_arg('is_predict', bool, False) +trn = 'data/train.list' if not is_predict else None +tst = 'data/test.list' if not is_predict else 'data/pred.list' +process = 'process' if not is_predict else 'process_predict' +define_py_data_sources2(train_list=trn, + test_list=tst, + module="dataprovider_emb", + obj=process, + args={"dictionary": word_dict}) + +batch_size = 128 if not is_predict else 1 +settings( + batch_size=batch_size, + learning_rate=2e-3, + learning_method=AdamOptimizer(), + regularization=L2Regularization(8e-4), + gradient_clipping_threshold=25 +) + +data = data_layer(name="word", size=len(word_dict)) +embedding = embedding_layer(input=data, size=128) +conv = sequence_conv_pool(input=embedding, context_len=3, hidden_size=512) +output = fc_layer(input=conv, size=2, act=SoftmaxActivation()) +if is_predict: + maxid = maxid_layer(output) + outputs([maxid, output]) +else: + label = data_layer(name="label", size=2) + cls = classification_cost(input=output, label=label) + outputs(cls) diff --git a/demo/quick_start/trainer_config.emb.py b/demo/quick_start/trainer_config.emb.py new file mode 100644 index 00000000000000..34dd7b96f2f142 --- /dev/null +++ b/demo/quick_start/trainer_config.emb.py @@ -0,0 +1,53 @@ +# edit-mode: -*- python -*- + +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +dict_file = "./data/dict.txt" +word_dict = dict() +with open(dict_file, 'r') as f: + for i, line in enumerate(f): + w = line.strip().split()[0] + word_dict[w] = i + +is_predict = get_config_arg('is_predict', bool, False) +trn = 'data/train.list' if not is_predict else None +tst = 'data/test.list' if not is_predict else 'data/pred.list' +process = 'process' if not is_predict else 'process_predict' +define_py_data_sources2(train_list=trn, + test_list=tst, + module="dataprovider_emb", + obj=process, + args={"dictionary": word_dict}) + +batch_size = 128 if not is_predict else 1 +settings( + batch_size=batch_size, + learning_rate=2e-3, + learning_method=AdamOptimizer() +) + +data = data_layer(name="word", size=len(word_dict)) +embedding = embedding_layer(input=data, size=128) +avg = pooling_layer(input=embedding, pooling_type=AvgPooling()) +output = fc_layer(input=avg, size=2, act=SoftmaxActivation()) +if is_predict: + maxid = maxid_layer(output) + outputs([maxid, output]) +else: + label = data_layer(name="label", size=2) + cls = classification_cost(input=output, label=label) + outputs(cls) diff --git a/demo/quick_start/trainer_config.lr.py b/demo/quick_start/trainer_config.lr.py new file mode 100644 index 00000000000000..119e3849a4b7e0 --- /dev/null +++ b/demo/quick_start/trainer_config.lr.py @@ -0,0 +1,73 @@ +# edit-mode: -*- python -*- + +# Copyright (c) 2016 Baidu, Inc. 
All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +dict_file = "./data/dict.txt" +word_dict = dict() +with open(dict_file, 'r') as f: + for i, line in enumerate(f): + w = line.strip().split()[0] + word_dict[w] = i + +is_predict = get_config_arg('is_predict', bool, False) +trn = 'data/train.list' if not is_predict else None +tst = 'data/test.list' if not is_predict else 'data/pred.list' +process = 'process' if not is_predict else 'process_predict' + +# define the data sources for the model. +# We need to use different process for training and prediction. +# For training, the input data includes both word IDs and labels. +# For prediction, the input data only includs word Ids. +define_py_data_sources2(train_list=trn, + test_list=tst, + module="dataprovider_bow", + obj=process, + args={"dictionary": word_dict}) + +batch_size = 128 if not is_predict else 1 +settings( + batch_size=batch_size, + learning_rate=2e-3, + learning_method=AdamOptimizer(), + regularization=L2Regularization(8e-4), + gradient_clipping_threshold=25 +) + +# Define the data for text features. The size of the data layer is the number +# of words in the dictionary. +data = data_layer(name="word", size=len(word_dict)) + +# Define a fully connected layer with logistic activation. +# (also called softmax activation). 
# Define a fully connected output layer with softmax activation over the
# two classes (logistic regression on the bag-of-words features).
output = fc_layer(input=data, size=2, act=SoftmaxActivation())

if not is_predict:
    # For training we need the label input and a cost layer.

    # Define the category id for each example.
    # The size of the data layer is the number of labels.
    label = data_layer(name="label", size=2)

    # Define cross-entropy classification loss.
    # BUG FIX: the original called classification_cost() twice; the first
    # call's result was discarded, but it still registered a duplicate,
    # unused cost layer in the generated network configuration.
    cls = classification_cost(input=output, label=label)
    outputs(cls)
else:
    # For prediction, no label is needed. We output the predicted
    # class id together with the class probabilities.
    maxid = maxid_layer(output)
    outputs([maxid, output])
+ +from paddle.trainer_config_helpers import * + +dict_file = "./data/dict.txt" +word_dict = dict() +with open(dict_file, 'r') as f: + for i, line in enumerate(f): + w = line.strip().split()[0] + word_dict[w] = i + +is_predict = get_config_arg('is_predict', bool, False) +trn = 'data/train.list' if not is_predict else None +tst = 'data/test.list' if not is_predict else 'data/pred.list' +process = 'process' if not is_predict else 'process_predict' +define_py_data_sources2(train_list=trn, + test_list=tst, + module="dataprovider_emb", + obj=process, + args={"dictionary": word_dict}) + +batch_size = 128 if not is_predict else 1 +settings( + batch_size=batch_size, + learning_rate=2e-3, + learning_method=AdamOptimizer(), + regularization=L2Regularization(8e-4), + gradient_clipping_threshold=25 +) + +bias_attr = ParamAttr(initial_std=0.,l2_rate=0.) + +data = data_layer(name="word", size=len(word_dict)) +emb = embedding_layer(input=data, size=128) +fc = fc_layer(input=emb, size=512, + act=LinearActivation(), + bias_attr=bias_attr, + layer_attr=ExtraAttr(drop_rate=0.1)) +lstm = lstmemory(input=fc, act=TanhActivation(), + bias_attr=bias_attr, + layer_attr=ExtraAttr(drop_rate=0.25)) +lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling()) +output = fc_layer(input=lstm_last, size=2, + bias_attr=bias_attr, + act=SoftmaxActivation()) +if is_predict: + maxid = maxid_layer(output) + outputs([maxid, output]) +else: + label = data_layer(name="label", size=2) + cls = classification_cost(input=output, label=label) + outputs(cls) diff --git a/demo/recommendation/.gitignore b/demo/recommendation/.gitignore new file mode 100644 index 00000000000000..aeae0f189dbbbf --- /dev/null +++ b/demo/recommendation/.gitignore @@ -0,0 +1,9 @@ +log.txt +data/meta.bin +data/ml-1m +data/ratings.dat.train +data/ratings.dat.test +data/train.list +data/test.list +dataprovider_copy_1.py +*.pyc diff --git a/demo/recommendation/common_utils.py b/demo/recommendation/common_utils.py new file mode 100755 
def meta_to_header(meta, name):
    """Yield one PyDataProvider2 input type per raw-meta field.

    :param meta: unpickled meta dict produced by meta_generator.py;
        ``meta[name]['__meta__']['raw_meta']`` is a list of field
        descriptors, each with a 'type' key.
    :param name: entity name in the meta file, e.g. 'movie' or 'user'.
    :return: generator of input-type objects, one per field:
        'id' -> integer_value sized by the field's recorded max id;
        'embedding' -> integer_value over the dictionary, emitted as a
        sequence when the field was scanned as one;
        'one_hot_dense' -> dense_vector sized by the dictionary.
    """
    metas = meta[name]['__meta__']['raw_meta']
    for each_meta in metas:
        if each_meta['type'] == 'id':
            yield integer_value(each_meta['max'])
        elif each_meta['type'] == 'embedding':
            # 'seq' is stored as the string 'sequence' for sequential fields.
            is_seq = each_meta['seq'] == 'sequence'
            yield integer_value(len(each_meta['dict']),
                                seq_type=SequenceType.SEQUENCE if is_seq
                                else SequenceType.NO_SEQUENCE)
        elif each_meta['type'] == 'one_hot_dense':
            yield dense_vector(len(each_meta['dict']))
def merge_dict(master_dict, slave_dict):
    """Merge two dicts, preferring truthy values from ``master_dict``.

    For every key present in either dict, the result holds the master's
    value unless it is missing or falsy, in which case the slave's value
    (possibly None) is used instead.
    """
    merged = {}
    for key in set(master_dict) | set(slave_dict):
        merged[key] = master_dict.get(key) or slave_dict.get(key)
    return merged
merge_dict(file_dict, DEFAULT_FILE) + + fields = [] + for pos, field_key in enumerate(val['fields']): + assert isinstance(field_key, basestring) + field = copy.deepcopy(DEFAULT_FIELD[field_key]) + field['pos'] = pos + fields.append(field) + obj[k] = { + "file": file_dict, + "fields": fields + } + meta = { + "meta": obj + } + # print meta + if fmt == 'json': + def formatter(x): + import json + return json.dumps(x, indent=2) + elif fmt == 'yaml': + def formatter(x): + import yaml + return yaml.safe_dump(x, default_flow_style=False) + else: + raise NotImplementedError("Dump format %s is not implemented" % fmt) + + print formatter(meta) + + +if __name__ == '__main__': + args = docopt.docopt(__doc__, version="0.1.0") + main(args[""], args["--output_format"]) diff --git a/demo/recommendation/data/meta_config.json b/demo/recommendation/data/meta_config.json new file mode 100644 index 00000000000000..cc6a046e271dd0 --- /dev/null +++ b/demo/recommendation/data/meta_config.json @@ -0,0 +1,81 @@ +{ + "meta": { + "movie": { + "fields": [ + { + "type": "id", + "pos": 0 + }, + { + "regex": { + "pattern": "^(.*)\\((\\d+)\\)$", + "group_id": 1, + "strip": true + }, + "type": { + "seq_type": "sequence", + "name": "embedding" + }, + "dict": { + "type": "char_based" + }, + "name": "title", + "pos": 1 + }, + { + "type": "one_hot_dense", + "dict": { + "delimiter": "|", + "type": "split" + }, + "name": "genres", + "pos": 2 + } + ], + "file": { + "delimiter": "::", + "type": "split", + "name": "movies.dat" + } + }, + "user": { + "fields": [ + { + "type": "id", + "pos": 0 + }, + { + "type": "embedding", + "dict": { + "type": "char_based" + }, + "name": "gender", + "pos": 1 + }, + { + "type": "embedding", + "dict": { + "sort": true, + "type": "whole_content" + }, + "name": "age", + "pos": 2 + }, + { + "type": "embedding", + "dict": { + "sort": "true", + "type": "whole_content" + }, + "name": "occupation", + "pos": 3 + } + ], + "file": { + "delimiter": "::", + "type": "split", + "name": 
"users.dat" + } + } + } +} diff --git a/demo/recommendation/data/meta_generator.py b/demo/recommendation/data/meta_generator.py new file mode 100644 index 00000000000000..8d1a33d02aea11 --- /dev/null +++ b/demo/recommendation/data/meta_generator.py @@ -0,0 +1,436 @@ +#!/bin/env python2 +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Preprocess Movielens dataset, to get movie/user object. + +Usage: + ./preprocess.py [--config=] + ./preprocess.py -h | --help + +Options: + -h --help Show this screen. + --version Show version. + --config= Get MetaData config file [default: config.json]. 
class UniqueIDGenerator(object):
    """Assign dense, consecutive integer ids to arbitrary hashable keys.

    The first key seen gets id 0, the next new key id 1, and so on.
    Repeated keys always map back to their previously assigned id.
    """

    def __init__(self):
        # defaultdict invokes __next_id__ the first time a key is seen,
        # so lookup and id allocation happen in a single step.
        self.pool = collections.defaultdict(self.__next_id__)
        self.next_id = 0

    def __next_id__(self):
        allocated = self.next_id
        self.next_id += 1
        return allocated

    def __call__(self, k):
        return self.pool[k]

    def to_list(self):
        """Return all keys ordered by their assigned ids."""
        ordered = [None] * len(self.pool)
        for key, idx in self.pool.items():
            ordered[idx] = key
        return ordered
class SplitEmbeddingDict(object):
    """Embedding dictionary for delimiter-separated multi-value fields.

    Each distinct token receives a unique integer id in first-seen order
    (via UniqueIDGenerator); a field parses to the list of its token ids.
    """

    def __init__(self, delimiter):
        # Token -> id mapping, populated lazily during scan().
        self.__id__ = UniqueIDGenerator()
        self.delimiter = delimiter

    def scan(self, multi):
        # First pass: register every token so it receives an id.
        for val in multi.split(self.delimiter):
            self.__id__(val)

    def parse(self, multi):
        # Second pass: map each token to its previously assigned id.
        return map(self.__id__, multi.split(self.delimiter))

    def meta_field(self):
        # The dictionary is serialized as the id-ordered token list.
        return self.__id__.to_list()
self.seq_type == EmbeddingFieldParser.SEQUENCE) + elif config['dict']['type'] == 'split': + self.dict = SplitEmbeddingDict( + config['dict'].get('delimiter', ',')) + elif config['dict']['type'] == 'whole_content': + self.dict = EmbeddingFieldParser.WholeContentDict( + config['dict']['sort']) + else: + print config + assert False + + self.name = config['name'] + + def scan(self, s): + self.dict.scan(s) + + def meta_field(self): + return { + 'name': self.name, + 'dict': self.dict.meta_field(), + 'type': 'embedding', + 'seq': self.seq_type + } + + def parse(self, s): + return self.dict.parse(s) + + +class OneHotDenseFieldParser(object): + TYPE = 'one_hot_dense' + + def __init__(self, config): + if config['dict']['type'] == 'split': + self.dict = SplitEmbeddingDict(config['dict']['delimiter']) + self.name = config['name'] + + def scan(self, s): + self.dict.scan(s) + + def meta_field(self): + # print self.dict.meta_field() + return { + 'dict': self.dict.meta_field(), + 'name': self.name, + 'type': 'one_hot_dense' + } + + def parse(self, s): + ids = self.dict.parse(s) + retv = [0.0] * len(self.dict.meta_field()) + for idx in ids: + retv[idx] = 1.0 + # print retv + return retv + + +class FieldParserFactory(object): + PARSERS = [IDFieldParser, EmbeddingFieldParser, OneHotDenseFieldParser] + + @staticmethod + def create(config): + if isinstance(config['type'], basestring): + config_type = config['type'] + elif isinstance(config['type'], dict): + config_type = config['type']['name'] + + assert config_type is not None + + for each_parser_cls in FieldParserFactory.PARSERS: + if config_type == each_parser_cls.TYPE: + return each_parser_cls(config) + print config + + +class CompositeFieldParser(object): + def __init__(self, parser, extractor): + self.extractor = extractor + self.parser = parser + + def scan(self, *args, **kwargs): + self.parser.scan(self.extractor.extract(*args, **kwargs)) + + def parse(self, *args, **kwargs): + return 
class PositionContentExtractor(object):
    """Extract the field at a fixed position from a pre-split line."""

    def __init__(self, pos):
        self.pos = pos

    def extract(self, line):
        assert isinstance(line, list)
        return line[self.pos]


class RegexPositionContentExtractor(PositionContentExtractor):
    """Extract a positional field, then pull one regex group out of it.

    :param pos: field index in the split line.
    :param pattern: regex matched against the whole field; must contain
        at least ``group_id`` groups.
    :param group_id: index of the group to return.
    :param strip: if True, strip surrounding whitespace from the result.
    """

    def __init__(self, pos, pattern, group_id, strip=True):
        PositionContentExtractor.__init__(self, pos)
        pattern = pattern.strip()
        self.pattern = re.compile(pattern)
        self.group_id = group_id
        self.strip = strip

    def extract(self, line):
        line = PositionContentExtractor.extract(self, line)
        match = self.pattern.match(line)
        assert match is not None
        txt = match.group(self.group_id)
        if self.strip:
            # BUG FIX: str.strip() returns a new string; the original
            # discarded the result, so the `strip` option had no effect.
            txt = txt.strip()
        return txt
meta): idx if 'is_key' in meta and meta['is_key'] + else None, enumerate(metas)))[0] + + key_map = [] + for i in range(min(key_index, len(metas))): + key_map.append(i) + for i in range(key_index + 1, len(metas)): + key_map.append(i) + + obj = { + '__meta__': { + 'raw_meta': metas, + 'feature_map': key_map + } + } + + for each_block in reader.read(): + idx = field_parsers[key_index].parse(each_block) + val = [] + for i, each_parser in enumerate(field_parsers): + if i != key_index: + val.append(each_parser.parse(each_block)) + obj[idx] = val + ret_obj[key] = obj + self.obj = ret_obj + return ret_obj + + @staticmethod + def __field_config_mapper__(conf): + assert isinstance(conf, dict) + extrator = ContentExtractorFactory.create(conf) + field_parser = FieldParserFactory.create(conf) + assert extrator is not None + assert field_parser is not None + return CompositeFieldParser(field_parser, extrator) + + def dump(self, fp): + pickle.dump(self.obj, fp, pickle.HIGHEST_PROTOCOL) + + +def preprocess(binary_filename, dataset_dir, config, **kwargs): + assert isinstance(config, str) + with open(config, 'r') as config_file: + file_loader = None + if config.lower().endswith('.yaml'): + import yaml + file_loader = yaml + elif config.lower().endswith('.json'): + import json + file_loader = json + config = file_loader.load(config_file) + meta = MetaFile(dataset_dir) + meta.parse(config) + with open(binary_filename, 'wb') as outf: + meta.dump(outf) + + +if __name__ == '__main__': + args = docopt.docopt(__doc__, version='0.1.0') + kwargs = dict() + for key in args.keys(): + if key != '--help': + param_name = key + assert isinstance(param_name, str) + param_name = param_name.replace('<', '') + param_name = param_name.replace('>', '') + param_name = param_name.replace('--', '') + kwargs[param_name] = args[key] + preprocess(**kwargs) diff --git a/demo/recommendation/data/ml_data.sh b/demo/recommendation/data/ml_data.sh new file mode 100755 index 00000000000000..408a8723e086d3 --- 
/dev/null +++ b/demo/recommendation/data/ml_data.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -ex +cd "$(dirname "$0")" +# download the dataset +wget http://files.grouplens.org/datasets/movielens/ml-1m.zip +# unzip the dataset +unzip ml-1m.zip +# remove the unused zip file +rm ml-1m.zip diff --git a/demo/recommendation/data/split.py b/demo/recommendation/data/split.py new file mode 100644 index 00000000000000..ff1f7fab7befdb --- /dev/null +++ b/demo/recommendation/data/split.py @@ -0,0 +1,67 @@ +#!/bin/env python2 +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Separate movielens 1m dataset to train/test file. + +Usage: + ./separate.py [--test_ratio=] [--delimiter=] + ./separate.py -h | --help + +Options: + -h --help Show this screen. + --version Show version. + --test_ratio= Test ratio for separate [default: 0.1]. 
def process(test_ratio, input_file, delimiter, **kwargs):
    """Split a ratings file into train/test sets, stratified per user.

    Lines are grouped by user id (the first delimited field), shuffled
    within each user, and the first ``test_ratio`` fraction of each
    user's ratings is written to ``<input_file>.test``; the remainder
    goes to ``<input_file>.train``.

    :param test_ratio: fraction of each user's lines for the test file
        (str or float; converted with float()).
    :param input_file: path to the ratings file, e.g. ml-1m/ratings.dat.
    :param delimiter: field delimiter, '::' for MovieLens.
    :param kwargs: unused extra docopt arguments (e.g. 'version').
    """
    test_ratio = float(test_ratio)
    # user_id -> list of that user's rating lines (stripped).
    rating_dict = collections.defaultdict(list)
    with open(input_file, 'r') as f:
        for line in f:
            user_id = int(line.split(delimiter)[0])
            rating_dict[user_id].append(line.strip())

    with open(input_file + ".train", 'w') as train_file:
        with open(input_file + ".test", 'w') as test_file:
            for k in rating_dict.keys():
                lines = rating_dict[k]
                assert isinstance(lines, list)
                # Shuffle so the test split is a random sample per user.
                random.shuffle(lines)
                test_len = int(len(lines) * test_ratio)
                for line in lines[:test_len]:
                    print >> test_file, line

                for line in lines[test_len:]:
                    print >> train_file, line
+ +try: + import cPickle as pickle +except ImportError: + import pickle + +from paddle.trainer.PyDataProvider2 import * +import common_utils # parse + + +def hook(settings, meta, **kwargs): + """ + Init hook is invoked before process data. It will set obj.slots and store + data meta. + + :param obj: global object. It will passed to process routine. + :type obj: object + :param meta: the meta file object, which passed from trainer_config. Meta + file record movie/user features. + :param kwargs: unused other arguments. + """ + del kwargs # unused kwargs + + # Header define slots that used for paddle. + # first part is movie features. + # second part is user features. + # final part is rating score. + # header is a list of [USE_SEQ_OR_NOT?, SlotType] + headers = list(common_utils.meta_to_header(meta, 'movie')) + headers.extend(list(common_utils.meta_to_header(meta, 'user'))) + headers.append(dense_vector(1)) # Score + + # slot types. + settings.input_types = headers + settings.meta = meta + + +@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM) +def process(settings, filename): + with open(filename, 'r') as f: + for line in f: + # Get a rating from file. + user_id, movie_id, score = map(int, line.split('::')[:-1]) + + # Scale score to [-5, +5] + score = float(score) * 2 - 5.0 + + # Get movie/user features by movie_id, user_id + movie_meta = settings.meta['movie'][movie_id] + user_meta = settings.meta['user'][user_id] + + outputs = [movie_id - 1] + + # Then add movie features + for each_meta in movie_meta: + outputs.append(each_meta) + + # Then add user id. + outputs.append(user_id - 1) + + # Then add user features. 
+ for each_meta in user_meta: + outputs.append(each_meta) + + # Finally, add score + outputs.append([score]) + # Return data to paddle + yield outputs diff --git a/demo/recommendation/evaluate.sh b/demo/recommendation/evaluate.sh new file mode 100755 index 00000000000000..38c1562c6370dd --- /dev/null +++ b/demo/recommendation/evaluate.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +function get_best_pass() { + cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | sort | head -n 1 +} + +LOG=`get_best_pass log.txt` +LOG=(${LOG}) +echo 'Best pass is '${LOG[1]}, ' error is '${LOG[0]}, 'which means predict get error as '`echo ${LOG[0]} | python -c 'import math; print math.sqrt(float(raw_input()))/2'` + +evaluate_pass="output/pass-${LOG[1]}" + +echo 'evaluating from pass '$evaluate_pass diff --git a/demo/recommendation/prediction.py b/demo/recommendation/prediction.py new file mode 100755 index 00000000000000..1a6cfce58fe537 --- /dev/null +++ b/demo/recommendation/prediction.py @@ -0,0 +1,51 @@ +#!/bin/env python2 +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from py_paddle import swig_paddle, DataProviderWrapperConverter + +from common_utils import * +from paddle.trainer.config_parser import parse_config + +try: + import cPickle as pickle +except ImportError: + import pickle +import sys + +if __name__ == '__main__': + model_path = sys.argv[1] + swig_paddle.initPaddle('--use_gpu=0') + conf = parse_config("trainer_config.py", "is_predict=1") + network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) + assert isinstance(network, swig_paddle.GradientMachine) + network.loadParameters(model_path) + with open('meta.bin', 'rb') as f: + meta = pickle.load(f) + headers = list(meta_to_header(meta, 'movie')) + headers.extend(list(meta_to_header(meta, 'user'))) + cvt = DataProviderWrapperConverter(True, map(lambda x: x[1], headers)) + while True: + movie_id = int(raw_input("Input movie_id: ")) + user_id = int(raw_input("Input user_id: ")) + movie_meta = meta['movie'][movie_id] # Query Data From Meta. + user_meta = meta['user'][user_id] + data = [movie_id - 1] + data.extend(movie_meta) + data.append(user_id - 1) + data.extend(user_meta) + data = map(lambda (header, val): val if header[0] else [val], + zip(headers, data)) + print "Prediction Score is %.2f" % ((network.forwardTest(cvt([ + data]))[0]['value'][0][0] + 5) / 2) diff --git a/demo/recommendation/preprocess.sh b/demo/recommendation/preprocess.sh new file mode 100755 index 00000000000000..e181d0be455589 --- /dev/null +++ b/demo/recommendation/preprocess.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. 
All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +cd "$(dirname "$0")" +delimiter='::' +dir=ml-1m +cd data +echo 'generate meta config file' +python config_generator.py config.json > meta_config.json +echo 'generate meta file' +python meta_generator.py $dir meta.bin --config=meta_config.json +echo 'split train/test file' +python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1 +echo 'shuffle train file' +shuf $dir/ratings.dat.train > ratings.dat.train +cp $dir/ratings.dat.test . +echo "./data/ratings.dat.train" > train.list +echo "./data/ratings.dat.test" > test.list diff --git a/demo/recommendation/requirements.txt b/demo/recommendation/requirements.txt new file mode 100644 index 00000000000000..1ea154584a428b --- /dev/null +++ b/demo/recommendation/requirements.txt @@ -0,0 +1,2 @@ +PyYAML +docopt diff --git a/demo/recommendation/run.sh b/demo/recommendation/run.sh new file mode 100755 index 00000000000000..846b59cec9fc50 --- /dev/null +++ b/demo/recommendation/run.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +paddle train \ + --config=trainer_config.py \ + --save_dir=./output \ + --use_gpu=false \ + --trainer_count=4\ + --test_all_data_in_one_period=true \ + --log_period=100 \ + --dot_period=1 \ + --num_passes=50 2>&1 | tee 'log.txt' diff --git a/demo/recommendation/trainer_config.py b/demo/recommendation/trainer_config.py new file mode 100755 index 00000000000000..69b9aa7a77cafd --- /dev/null +++ b/demo/recommendation/trainer_config.py @@ -0,0 +1,101 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +try: + import cPickle as pickle +except ImportError: + import pickle + +is_predict = get_config_arg('is_predict', bool, False) + +META_FILE = 'data/meta.bin' + +with open(META_FILE, 'rb') as f: + # load meta file + meta = pickle.load(f) + +settings(batch_size=1600, learning_rate=1e-3, + learning_method=RMSPropOptimizer()) + + +def construct_feature(name): + """ + Construct movie/user features. + + This method read from meta data. 
Then convert feature to neural network due + to feature type. The map relation as follow. + + * id: embedding => fc + * embedding: + is_sequence: embedding => context_projection => fc => pool + not sequence: embedding => fc + * one_hot_dense: fc => fc + + Then gather all features vector, and use a fc layer to combined them as + return. + + :param name: 'movie' or 'user' + :type name: basestring + :return: combined feature output + :rtype: LayerOutput + """ + __meta__ = meta[name]['__meta__']['raw_meta'] + fusion = [] + for each_meta in __meta__: + type_name = each_meta['type'] + slot_name = each_meta.get('name', '%s_id' % name) + if type_name == 'id': + slot_dim = each_meta['max'] + embedding = embedding_layer(input=data_layer(slot_name, + size=slot_dim), + size=256, + param_attr=ParamAttr( + sparse_update=True)) + fusion.append(fc_layer(input=embedding, + size=256)) + elif type_name == 'embedding': + is_seq = each_meta['seq'] == 'sequence' + slot_dim = len(each_meta['dict']) + din = data_layer(slot_name, slot_dim) + embedding = embedding_layer(input=din, size=256) + if is_seq: + fusion.append( + text_conv_pool(input=embedding, context_len=5, + hidden_size=256)) + else: + fusion.append(fc_layer(input=embedding, + size=256)) + elif type_name == 'one_hot_dense': + slot_dim = len(each_meta['dict']) + hidden = fc_layer(input=data_layer(slot_name, slot_dim), + size=256) + fusion.append(fc_layer(input=hidden, + size=256)) + + return fc_layer(name="%s_fusion" % name, input=fusion, size=256) + + +movie_feature = construct_feature("movie") +user_feature = construct_feature("user") +similarity = cos_sim(a=movie_feature, b=user_feature) +if not is_predict: + outputs(regression_cost(input=similarity, + label=data_layer('rating', size=1))) + + define_py_data_sources2('data/train.list', 'data/test.list', module='dataprovider', + obj='process', args={'meta': meta}) +else: + outputs(similarity) diff --git a/demo/semantic_role_labeling/data/extract_dict_feature.py 
b/demo/semantic_role_labeling/data/extract_dict_feature.py new file mode 100644 index 00000000000000..2982e54c665b41 --- /dev/null +++ b/demo/semantic_role_labeling/data/extract_dict_feature.py @@ -0,0 +1,88 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os +from optparse import OptionParser + + +def extract_dict_features(pair_file, feature_file, src_dict_file, + tgt_dict_file): + src_dict = set() + tgt_dict = set() + + with open(pair_file) as fin, open(feature_file, 'w') as feature_out, open( + src_dict_file, 'w') as src_dict_out, open(tgt_dict_file, + 'w') as tgt_dict_out: + for line in fin: + sentence, labels = line.strip().split('\t') + sentence_list = sentence.split() + labels_list = labels.split() + + src_dict.update(sentence_list) + tgt_dict.update(labels_list) + + verb_index = labels_list.index('B-V') + verb_feature = sentence_list[verb_index] + + mark = [0] * len(labels_list) + if verb_index > 0: + mark[verb_index - 1] = 1 + ctx_n1 = sentence_list[verb_index - 1] + else: + ctx_n1 = 'bos' + ctx_n1_feature = ctx_n1 + + mark[verb_index] = 1 + ctx_0_feature = sentence_list[verb_index] + + if verb_index < len(labels_list) - 2: + mark[verb_index + 1] = 1 + ctx_p1 = sentence_list[verb_index + 1] + else: + ctx_p1 = 'eos' + ctx_p1_feature = ctx_p1 + + feature_str = sentence + '\t' \ + + verb_feature + '\t' \ + + ctx_n1_feature + '\t' \ + + ctx_0_feature + '\t' \ + + ctx_p1_feature + '\t' \ 
+ + ' '.join([str(i) for i in mark]) + '\t' \ + + labels + + feature_out.write(feature_str + '\n') + + src_dict_out.write('\n') + src_dict_out.write('\n'.join(list(src_dict))) + + tgt_dict_out.write('\n'.join(list(tgt_dict))) + + +if __name__ == '__main__': + + usage = '-p pair_file -f feature_file -s source dictionary -t target dictionary ' + parser = OptionParser(usage) + parser.add_option('-p', dest='pair_file', help='the pair file') + parser.add_option( + '-f', dest='feature_file', help='the file to store feature') + parser.add_option( + '-s', dest='src_dict', help='the file to store source dictionary') + parser.add_option( + '-t', dest='tgt_dict', help='the file to store target dictionary') + + (options, args) = parser.parse_args() + + extract_dict_features(options.pair_file, options.feature_file, + options.src_dict, options.tgt_dict) diff --git a/demo/semantic_role_labeling/data/extract_pairs.py b/demo/semantic_role_labeling/data/extract_pairs.py new file mode 100644 index 00000000000000..4d1bef8f958a62 --- /dev/null +++ b/demo/semantic_role_labeling/data/extract_pairs.py @@ -0,0 +1,118 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os +from optparse import OptionParser + + +def read_labels(props_file): + ''' + a sentence maybe has more than one verb, each verb has its label sequence + label[], is a 3-dimension list. 
+ the first dim is to store all sentence's label seqs, len is the sentence number + the second dim is to store all label sequences for one sentences + the third dim is to store each label for one word + ''' + labels = [] + with open(props_file) as fin: + label_seqs_for_one_sentences = [] + one_seg_in_file = [] + for line in fin: + line = line.strip() + if line == '': + for i in xrange(len(one_seg_in_file[0])): + a_kind_lable = [x[i] for x in one_seg_in_file] + label_seqs_for_one_sentences.append(a_kind_lable) + labels.append(label_seqs_for_one_sentences) + one_seg_in_file = [] + label_seqs_for_one_sentences = [] + else: + part = line.split() + one_seg_in_file.append(part) + return labels + + +def read_sentences(words_file): + sentences = [] + with open(words_file) as fin: + s = '' + for line in fin: + line = line.strip() + if line == '': + sentences.append(s.lower()) + s = '' + else: + s += line + ' ' + return sentences + + +def transform_labels(sentences, labels): + sen_lab_pair = [] + for i in xrange(len(sentences)): + if len(labels[i]) == 1: + continue + else: + for j in xrange(1, len(labels[i])): + label_list = labels[i][j] + current_tag = 'O' + is_in_bracket = False + label_seq = [] + verb_word = '' + for ll in label_list: + if ll == '*' and is_in_bracket == False: + label_seq.append('O') + elif ll == '*' and is_in_bracket == True: + label_seq.append('I-' + current_tag) + elif ll == '*)': + label_seq.append('I-' + current_tag) + is_in_bracket = False + elif ll.find('(') != -1 and ll.find(')') != -1: + current_tag = ll[1:ll.find('*')] + label_seq.append('B-' + current_tag) + is_in_bracket = False + elif ll.find('(') != -1 and ll.find(')') == -1: + current_tag = ll[1:ll.find('*')] + label_seq.append('B-' + current_tag) + is_in_bracket = True + else: + print 'error:', ll + + sen_lab_pair.append((sentences[i], label_seq)) + return sen_lab_pair + + +def write_file(sen_lab_pair, output_file): + with open(output_file, 'w') as fout: + for x in sen_lab_pair: + sentence 
= x[0] + label_seq = ' '.join(x[1]) + assert len(sentence.split()) == len(x[1]) + fout.write(sentence + '\t' + label_seq + '\n') + + +if __name__ == '__main__': + + usage = '-w words_file -p props_file -o output_file' + parser = OptionParser(usage) + parser.add_option('-w', dest='words_file', help='the words file') + parser.add_option('-p', dest='props_file', help='the props file') + parser.add_option('-o', dest='output_file', help='the output_file') + (options, args) = parser.parse_args() + + sentences = read_sentences(options.words_file) + labels = read_labels(options.props_file) + sen_lab_pair = transform_labels(sentences, labels) + + write_file(sen_lab_pair, options.output_file) diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh new file mode 100644 index 00000000000000..268c0995e27006 --- /dev/null +++ b/demo/semantic_role_labeling/data/get_data.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz +tar -xzvf conll05st-tests.tar.gz +rm conll05st-tests.tar.gz +cp ./conll05st-release/test.wsj/words/test.wsj.words.gz . +cp ./conll05st-release/test.wsj/props/test.wsj.props.gz . 
+gunzip test.wsj.words.gz +gunzip test.wsj.props.gz + +python extract_pairs.py -w test.wsj.words -p test.wsj.props -o test.wsj.seq_pair +python extract_dict_feature.py -p test.wsj.seq_pair -f feature -s src.dict -t tgt.dict diff --git a/demo/semantic_role_labeling/data/test.list b/demo/semantic_role_labeling/data/test.list new file mode 100644 index 00000000000000..ec370e897a7811 --- /dev/null +++ b/demo/semantic_role_labeling/data/test.list @@ -0,0 +1 @@ +./data/feature diff --git a/demo/semantic_role_labeling/data/train.list b/demo/semantic_role_labeling/data/train.list new file mode 100644 index 00000000000000..ec370e897a7811 --- /dev/null +++ b/demo/semantic_role_labeling/data/train.list @@ -0,0 +1 @@ +./data/feature diff --git a/demo/semantic_role_labeling/dataprovider.py b/demo/semantic_role_labeling/dataprovider.py new file mode 100644 index 00000000000000..ca7346b3db97e8 --- /dev/null +++ b/demo/semantic_role_labeling/dataprovider.py @@ -0,0 +1,57 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer.PyDataProvider2 import * + +UNK_IDX = 0 + + +def hook(settings, word_dict, label_dict, **kwargs): + settings.word_dict = word_dict + settings.label_dict = label_dict + #all inputs are integral and sequential type + settings.slots = [ + integer_value(len(word_dict), seq_type=SequenceType.SEQUENCE), + integer_value(len(word_dict), seq_type=SequenceType.SEQUENCE), + integer_value(len(word_dict), seq_type=SequenceType.SEQUENCE), + integer_value(len(word_dict), seq_type=SequenceType.SEQUENCE), + integer_value(len(word_dict), seq_type=SequenceType.SEQUENCE), + integer_value(2, seq_type=SequenceType.SEQUENCE), + integer_value(len(label_dict), seq_type=SequenceType.SEQUENCE)] + + +@provider(init_hook=hook) +def process(obj, file_name): + with open(file_name, 'r') as fdata: + for line in fdata: + sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = \ + line.strip().split('\t') + + words = sentence.split() + sen_len = len(words) + word_slot = [obj.word_dict.get(w, UNK_IDX) for w in words] + + predicate_slot = [obj.word_dict.get(predicate, UNK_IDX)] * sen_len + ctx_n1_slot = [obj.word_dict.get(ctx_n1, UNK_IDX)] * sen_len + ctx_0_slot = [obj.word_dict.get(ctx_0, UNK_IDX)] * sen_len + ctx_p1_slot = [obj.word_dict.get(ctx_p1, UNK_IDX)] * sen_len + + marks = mark.split() + mark_slot = [int(w) for w in marks] + + label_list = label.split() + label_slot = [obj.label_dict.get(w) for w in label_list] + + yield word_slot, predicate_slot, ctx_n1_slot, \ + ctx_0_slot, ctx_p1_slot, mark_slot, label_slot diff --git a/demo/semantic_role_labeling/db_lstm.py b/demo/semantic_role_labeling/db_lstm.py new file mode 100644 index 00000000000000..364460afbe31ca --- /dev/null +++ b/demo/semantic_role_labeling/db_lstm.py @@ -0,0 +1,141 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import math +import os +import sys +from paddle.trainer_config_helpers import * + +#file paths +word_dict_file = './data/src.dict' +label_dict_file = './data/tgt.dict' +train_list_file = './data/train.list' +test_list_file = './data/test.list' + +is_test = get_config_arg('is_test', bool, False) +is_predict = get_config_arg('is_predict', bool, False) + +if not is_predict: + #load dictionaries + word_dict = dict() + label_dict = dict() + with open(word_dict_file, 'r') as f_word, \ + open(label_dict_file, 'r') as f_label: + for i, line in enumerate(f_word): + w = line.strip() + word_dict[w] = i + + for i, line in enumerate(f_label): + w = line.strip() + label_dict[w] = i + + if is_test: + train_list_file = None + + #define data provider + define_py_data_sources2( + train_list=train_list_file, + test_list=test_list_file, + module='dataprovider', + obj='process', + args={'word_dict': word_dict, + 'label_dict': label_dict}) + + word_dict_len = len(word_dict) + label_dict_len = len(label_dict) + +else: + word_dict_len = get_config_arg('dict_len', int) + label_dict_len = get_config_arg('label_len', int) + +mark_dict_len = 2 +word_dim = 32 +mark_dim = 5 +hidden_dim = 128 +depth = 8 +emb_lr = 1e-2 +fc_lr = 1e-2 +lstm_lr = 2e-2 + +settings( + batch_size=150, + learning_method=AdamOptimizer(), + learning_rate=1e-3, + regularization=L2Regularization(8e-4), + gradient_clipping_threshold=25) + +#6 features +word = data_layer(name='word_data', size=word_dict_len) +predicate = data_layer(name='verb_data', size=word_dict_len) +ctx_n1 = data_layer(name='ctx_n1_data', 
size=word_dict_len) +ctx_0 = data_layer(name='ctx_0_data', size=word_dict_len) +ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len) +mark = data_layer(name='mark_data', size=mark_dict_len) + +if not is_predict: + target = data_layer(name='target', size=label_dict_len) + +ptt = ParameterAttribute(name='src_emb', learning_rate=emb_lr) +layer_attr = ExtraLayerAttribute(drop_rate=0.5) +fc_para_attr = ParameterAttribute(learning_rate=fc_lr) +lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=lstm_lr) +para_attr = [fc_para_attr, lstm_para_attr] + +word_embedding = embedding_layer(size=word_dim, input=word, param_attr=ptt) +predicate_embedding = embedding_layer( + size=word_dim, input=predicate, param_attr=ptt) +ctx_n1_embedding = embedding_layer(size=word_dim, input=ctx_n1, param_attr=ptt) +ctx_0_embedding = embedding_layer(size=word_dim, input=ctx_0, param_attr=ptt) +ctx_p1_embedding = embedding_layer(size=word_dim, input=ctx_p1, param_attr=ptt) +mark_embedding = embedding_layer(size=mark_dim, input=mark) + +hidden_0 = mixed_layer( + size=hidden_dim, + input=[ + full_matrix_projection(input=word_embedding), + full_matrix_projection(input=predicate_embedding), + full_matrix_projection(input=ctx_n1_embedding), + full_matrix_projection(input=ctx_0_embedding), + full_matrix_projection(input=ctx_p1_embedding), + full_matrix_projection(input=mark_embedding), + ]) + +lstm_0 = lstmemory(input=hidden_0, layer_attr=layer_attr) + +#stack L-LSTM and R-LSTM with direct edges +input_tmp = [hidden_0, lstm_0] + +for i in range(1, depth): + + fc = fc_layer(input=input_tmp, size=hidden_dim, param_attr=para_attr) + + lstm = lstmemory( + input=fc, + act=ReluActivation(), + reverse=(i % 2) == 1, + layer_attr=layer_attr) + input_tmp = [fc, lstm] + +prob = fc_layer( + input=input_tmp, + size=label_dict_len, + act=SoftmaxActivation(), + param_attr=para_attr) + +if not is_predict: + cls = classification_cost(input=prob, label=target) + outputs(cls) +else: + outputs(prob) 
diff --git a/demo/semantic_role_labeling/predict.py b/demo/semantic_role_labeling/predict.py new file mode 100644 index 00000000000000..5250ec6dc68559 --- /dev/null +++ b/demo/semantic_role_labeling/predict.py @@ -0,0 +1,164 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +from optparse import OptionParser +from py_paddle import swig_paddle, util, DataProviderWrapperConverter +from paddle.trainer.PyDataProviderWrapper import IndexSlot +from paddle.trainer.config_parser import parse_config +""" +Usage: run following command to show help message. + python predict.py -h +""" +UNK_IDX = 0 + + +class Prediction(): + def __init__(self, train_conf, dict_file, model_dir, label_file): + """ + train_conf: trainer configure. + dict_file: word dictionary file name. + model_dir: directory of model. 
+ """ + + self.dict = {} + self.labels = {} + self.labels_reverse = {} + self.load_dict_label(dict_file, label_file) + + len_dict = len(self.dict) + len_label = len(self.labels) + + conf = parse_config( + train_conf, + 'dict_len=' + str(len_dict) + + ',label_len=' + str(len_label) + + ',is_predict=True') + self.network = swig_paddle.GradientMachine.createFromConfigProto( + conf.model_config) + self.network.loadParameters(model_dir) + + slots = [IndexSlot(len_dict), IndexSlot(len_dict), IndexSlot(len_dict), + IndexSlot(len_dict), IndexSlot(len_dict), IndexSlot(2)] + self.converter = util.DataProviderWrapperConverter(True, slots) + + def load_dict_label(self, dict_file, label_file): + """ + Load dictionary from self.dict_file. + """ + for line_count, line in enumerate(open(dict_file, 'r')): + self.dict[line.strip()] = line_count + + for line_count, line in enumerate(open(label_file, 'r')): + self.labels[line.strip()] = line_count + self.labels_reverse[line_count] = line.strip() + + def get_data(self, data_file): + """ + Get input data of paddle format. + """ + with open(data_file, 'r') as fdata: + for line in fdata: + sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = line.strip( + ).split('\t') + words = sentence.split() + sen_len = len(words) + + word_slot = [self.dict.get(w, UNK_IDX) for w in words] + predicate_slot = [self.dict.get(predicate, UNK_IDX)] * sen_len + ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len + ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len + ctx_p1_slot = [self.dict.get(ctx_p1, UNK_IDX)] * sen_len + + marks = mark.split() + mark_slot = [int(w) for w in marks] + + yield word_slot, predicate_slot, ctx_n1_slot, \ + ctx_0_slot, ctx_p1_slot, mark_slot + + def predict(self, data_file): + """ + data_file: file name of input data. 
+ """ + input = self.converter(self.get_data(data_file)) + output = self.network.forwardTest(input) + prob = output[0]["value"] + lab = list(np.argsort(-prob)[:, 0]) + + with open(data_file, 'r') as fin, open('predict.res', 'w') as fout: + index = 0 + for line in fin: + sen = line.split('\t')[0] + len_sen = len(sen.split()) + line_labels = lab[index:index + len_sen] + index += len_sen + fout.write(sen + '\t' + ' '.join([self.labels_reverse[ + i] for i in line_labels]) + '\n') + + +def option_parser(): + usage = ("python predict.py -c config -w model_dir " + "-d word dictionary -l label_file -i input_file") + parser = OptionParser(usage="usage: %s [options]" % usage) + parser.add_option( + "-c", + "--tconf", + action="store", + dest="train_conf", + help="network config") + parser.add_option( + "-d", + "--dict", + action="store", + dest="dict_file", + help="dictionary file") + parser.add_option( + "-l", + "--label", + action="store", + dest="label_file", + default=None, + help="label file") + parser.add_option( + "-i", + "--data", + action="store", + dest="data_file", + help="data file to predict") + parser.add_option( + "-w", + "--model", + action="store", + dest="model_path", + default=None, + help="model path") + return parser.parse_args() + + +def main(): + options, args = option_parser() + train_conf = options.train_conf + data_file = options.data_file + dict_file = options.dict_file + model_path = options.model_path + label_file = options.label_file + + swig_paddle.initPaddle("--use_gpu=0") + predict = Prediction(train_conf, dict_file, model_path, label_file) + predict.predict(data_file) + + +if __name__ == '__main__': + main() diff --git a/demo/semantic_role_labeling/predict.sh b/demo/semantic_role_labeling/predict.sh new file mode 100644 index 00000000000000..a545b9a5d591b4 --- /dev/null +++ b/demo/semantic_role_labeling/predict.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# Copyright (c) 2016 Baidu, Inc. 
All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +function get_best_pass() { + cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ + sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \ + sort | head -n 1 +} + +log=train.log +LOG=`get_best_pass $log` +LOG=(${LOG}) +best_model_path="output/pass-${LOG[1]}" + + +config_file=db_lstm.py +dict_file=./data/src.dict +label_file=./data/tgt.dict +input_file=./data/feature + +python predict.py \ + -c $config_file \ + -w $best_model_path \ + -l $label_file \ + -d $dict_file \ + -i $input_file diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh new file mode 100644 index 00000000000000..804f722e5b8e9e --- /dev/null +++ b/demo/semantic_role_labeling/test.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -e + +function get_best_pass() { + cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ + sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\ + sort | head -n 1 +} + +log=train.log +LOG=`get_best_pass $log` +LOG=(${LOG}) +evaluate_pass="output/pass-${LOG[1]}" + +echo 'evaluating from pass '$evaluate_pass +model_list=./model.list +touch $model_list | echo $evaluate_pass > $model_list + +paddle train \ + --config=./db_lstm.py \ + --model_list=$model_list \ + --job=test \ + --use_gpu=false \ + --config_args=is_test=1 \ +2>&1 | tee 'test.log' + diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh new file mode 100644 index 00000000000000..94c7b6f31df3b5 --- /dev/null +++ b/demo/semantic_role_labeling/train.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -e +paddle train \ + --config=./db_lstm.py \ + --save_dir=./output \ + --trainer_count=4 \ + --log_period=10 \ + --num_passes=500 \ + --use_gpu=false \ + --show_parameter_stats_period=10 \ + --test_all_data_in_one_period=1 \ +2>&1 | tee 'train.log' + diff --git a/demo/sentiment/.gitignore b/demo/sentiment/.gitignore new file mode 100644 index 00000000000000..bf2a9ab1ce3c93 --- /dev/null +++ b/demo/sentiment/.gitignore @@ -0,0 +1,11 @@ +data/aclImdb +data/imdb +data/pre-imdb +data/mosesdecoder-master +logs/ +model_output +dataprovider_copy_1.py +model.list +test.log +train.log +*.pyc diff --git a/demo/sentiment/data/get_imdb.sh b/demo/sentiment/data/get_imdb.sh new file mode 100755 index 00000000000000..41523927afe754 --- /dev/null +++ b/demo/sentiment/data/get_imdb.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e +set -x + +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +cd $DIR + +#download the dataset +echo "Downloading aclImdb..." +#http://ai.stanford.edu/%7Eamaas/data/sentiment/ +wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz + +echo "Downloading mosesdecoder..." +#https://github.com/moses-smt/mosesdecoder +wget https://github.com/moses-smt/mosesdecoder/archive/master.zip + +#extract package +echo "Unzipping..." 
+tar -zxvf aclImdb_v1.tar.gz +unzip master.zip + +#move train and test set to imdb_data directory +#in order to process when traing +mkdir -p imdb/train +mkdir -p imdb/test + +cp -r aclImdb/train/pos/ imdb/train/ +cp -r aclImdb/train/neg/ imdb/train/ + +cp -r aclImdb/test/pos/ imdb/test/ +cp -r aclImdb/test/neg/ imdb/test/ + +#remove compressed package +rm aclImdb_v1.tar.gz +rm master.zip + +echo "Done." diff --git a/demo/sentiment/dataprovider.py b/demo/sentiment/dataprovider.py new file mode 100755 index 00000000000000..c325d33485c872 --- /dev/null +++ b/demo/sentiment/dataprovider.py @@ -0,0 +1,34 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from paddle.trainer.PyDataProvider2 import *
+
+
+def hook(settings, dictionary, **kwargs):
+    settings.word_dict = dictionary
+    settings.input_types = [
+        integer_value(len(settings.word_dict), seq_type=SequenceType.SEQUENCE),
+        integer_value(2)]
+    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
+
+
+@provider(init_hook=hook)
+def process(settings, file_name):
+    with open(file_name, 'r') as fdata:
+        for line_count, line in enumerate(fdata):
+            label, comment = line.strip().split('\t\t')
+            label = int(label)
+            words = comment.split()
+            word_slot = [settings.word_dict[w] for w in words if w in
+                         settings.word_dict]
+            yield word_slot, label
diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py
new file mode 100755
index 00000000000000..4ece6bb06d9e30
--- /dev/null
+++ b/demo/sentiment/predict.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+from optparse import OptionParser
+from py_paddle import swig_paddle, util, DataProviderWrapperConverter
+from paddle.trainer.PyDataProviderWrapper import IndexSlot
+from paddle.trainer.config_parser import parse_config
+
+"""
+Usage: run following command to show help message.
+    python predict.py -h
+"""
+
+class SentimentPrediction():
+    def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
+        """
+        train_conf: trainer configure.
+        dict_file: word dictionary file name.
+        model_dir: directory of model.
+        """
+        self.train_conf = train_conf
+        self.dict_file = dict_file
+        self.word_dict = {}
+        self.dict_dim = self.load_dict()
+        self.model_dir = model_dir
+        if model_dir is None:
+            self.model_dir = os.path.dirname(train_conf)
+
+        self.label = None
+        if label_file is not None:
+            self.load_label(label_file)
+
+        conf = parse_config(train_conf, "is_predict=1")
+        self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
+        self.network.loadParameters(self.model_dir)
+        slots = [IndexSlot(self.dict_dim)]
+        self.converter = util.DataProviderWrapperConverter(True, slots)
+
+    def load_dict(self):
+        """
+        Load dictionary from self.dict_file.
+        """
+        for line_count, line in enumerate(open(self.dict_file, 'r')):
+            self.word_dict[line.strip().split('\t')[0]] = line_count
+        return len(self.word_dict)
+
+    def load_label(self, label_file):
+        """
+        Load label.
+        """
+        self.label = {}
+        for v in open(label_file, 'r'):
+            self.label[int(v.split('\t')[1])] = v.split('\t')[0]
+
+    def get_data(self, data_file):
+        """
+        Get input data of paddle format.
+        """
+        with open(data_file, 'r') as fdata:
+            for line in fdata:
+                words = line.strip().split()
+                word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
+                if not word_slot:
+                    print "all words are not in dictionary: %s" % line
+                    continue
+                yield [word_slot]
+
+    def predict(self, data_file):
+        """
+        data_file: file name of input data.
+        """
+        input = self.converter(self.get_data(data_file))
+        output = self.network.forwardTest(input)
+        prob = output[0]["value"]
+        lab = np.argsort(-prob)
+        if self.label is None:
+            print("%s: predicting label is %d" % (data_file, lab[0][0]))
+        else:
+            print("%s: predicting label is %s" % (data_file, self.label[lab[0][0]]))
+
+def option_parser():
+    usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
+    parser = OptionParser(usage="usage: %s [options]" % usage)
+    parser.add_option("-n", "--tconf", action="store",
+                      dest="train_conf", help="network config")
+    parser.add_option("-d", "--dict", action="store",
+                      dest="dict_file", help="dictionary file")
+    parser.add_option("-b", "--label", action="store",
+                      dest="label", default=None,
+                      help="label file")
+    parser.add_option("-i", "--data", action="store",
+                      dest="data", help="data file to predict")
+    parser.add_option("-w", "--model", action="store",
+                      dest="model_path", default=None,
+                      help="model path")
+    return parser.parse_args()
+
+def main():
+    options, args = option_parser()
+    train_conf = options.train_conf
+    data = options.data
+    dict_file = options.dict_file
+    model_path = options.model_path
+    label = options.label
+    swig_paddle.initPaddle("--use_gpu=0")
+    predict = SentimentPrediction(train_conf, dict_file, model_path, label)
+    predict.predict(data)
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/sentiment/predict.sh b/demo/sentiment/predict.sh
new file mode 100755
index 00000000000000..c3bfc1c8b61921
--- /dev/null
+++ b/demo/sentiment/predict.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+config=trainer_config.py
+model=model_output/pass-00002/
+label=data/pre-imdb/labels.list
+python predict.py \
+    -n $config \
+    -w $model \
+    -b $label \
+    -d ./data/pre-imdb/dict.txt \
+    -i ./data/aclImdb/test/pos/10007_10.txt
diff --git a/demo/sentiment/preprocess.py b/demo/sentiment/preprocess.py
new file mode 100755
index 00000000000000..49b53d500a1bf8
--- /dev/null
+++ b/demo/sentiment/preprocess.py
@@ -0,0 +1,338 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import random
+import operator
+import numpy as np
+from subprocess import Popen, PIPE
+from os.path import join as join_path
+from optparse import OptionParser
+
+from paddle.utils.preprocess_util import *
+
+"""
+Usage: run following command to show help message.
+    python preprocess.py -h
+"""
+
+def save_dict(dict, filename, is_reverse=True):
+    """
+    Save dictionary into file.
+    dict: input dictionary.
+    filename: output file name, string.
+    is_reverse: True, descending order by value.
+                False, ascending order by value.
+    """
+    f = open(filename, 'w')
+    for k, v in sorted(dict.items(), key=operator.itemgetter(1),
+                       reverse=is_reverse):
+        f.write('%s\t%s\n' % (k, v))
+    f.close()
+
+def tokenize(sentences):
+    """
+    Use tokenizer.perl to tokenize input sentences.
+    tokenizer.perl is tool of Moses.
+    sentences : a list of input sentences.
+    return: a list of processed text.
+    """
+    dir = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
+    tokenizer_cmd = [dir, '-l', 'en', '-q', '-']
+    assert isinstance(sentences, list)
+    text = "\n".join(sentences)
+    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
+    tok_text, _ = tokenizer.communicate(text)
+    toks = tok_text.split('\n')[:-1]
+    return toks
+
+def read_lines(path):
+    """
+    path: String, file path.
+    return a list of sequence.
+    """
+    seqs = []
+    with open(path, 'r') as f:
+        for line in f.readlines():
+            line = line.strip()
+            if len(line):
+                seqs.append(line)
+    return seqs
+
+class SentimentDataSetCreate():
+    """
+    A class to process data for sentiment analysis task.
+    """
+    def __init__(self, data_path, output_path,
+                 use_tokenizer=True, multi_lines=False):
+        """
+        data_path: string, training and testing dataset path
+        output_path: string, output path, store processed dataset
+        multi_lines: whether a file has multi lines.
+                     In order to shuffle fully, it needs to read all files into
+                     memory, then shuffle them if one file has multi lines.
+        """
+        self.output_path = output_path
+        self.data_path = data_path
+
+        self.train_dir = 'train'
+        self.test_dir = 'test'
+
+        self.train_list = "train.list"
+        self.test_list = "test.list"
+
+        self.label_list = "labels.list"
+        self.classes_num = 0
+
+        self.batch_size = 50000
+        self.batch_dir = 'batches'
+
+        self.dict_file = "dict.txt"
+        self.dict_with_test = False
+        self.dict_size = 0
+        self.word_count = {}
+
+        self.tokenizer = use_tokenizer
+        self.overwrite = False
+
+        self.multi_lines = multi_lines
+
+        self.train_dir = join_path(data_path, self.train_dir)
+        self.test_dir = join_path(data_path, self.test_dir)
+        self.train_list = join_path(output_path, self.train_list)
+        self.test_list = join_path(output_path, self.test_list)
+        self.label_list = join_path(output_path, self.label_list)
+        self.dict_file = join_path(output_path, self.dict_file)
+
+    def data_list(self, path):
+        """
+        create dataset from path
+        path: data path
+        return: data list
+        """
+        label_set = get_label_set_from_dir(path)
+        data = []
+        for lab_name in label_set.keys():
+            file_paths = list_files(join_path(path, lab_name))
+            for p in file_paths:
+                data.append({"label": label_set[lab_name],
+                             "seq_path": p})
+        return data, label_set
+
+    def create_dict(self, data):
+        """
+        create dict for input data.
+        data: list, [sequence, sequence, ...]
+        """
+        for seq in data:
+            for w in seq.strip().lower().split():
+                if w not in self.word_count:
+                    self.word_count[w] = 1
+                else:
+                    self.word_count[w] += 1
+
+    def create_dataset(self):
+        """
+        create file batches and dictionary of train data set.
+        If the self.overwrite is false and train.list already exists in
+        self.output_path, this function will not create and save file
+        batches from the data set path.
+        return: dictionary size, class number.
+        """
+        out_path = self.output_path
+        if out_path and not os.path.exists(out_path):
+            os.makedirs(out_path)
+
+        # If self.overwrite is false or self.train_list has existed,
+        # it will not process dataset.
+        if not (self.overwrite or not os.path.exists(self.train_list)):
+            print "%s already exists." % self.train_list
+            return
+
+        # Preprocess train data.
+        train_data, train_lab_set = self.data_list(self.train_dir)
+        print "processing train set..."
+        file_lists = self.save_data(train_data,
+                                    "train",
+                                    self.batch_size,
+                                    True,
+                                    True)
+        save_list(file_lists, self.train_list)
+
+        # If have test data path, preprocess test data.
+        if os.path.exists(self.test_dir):
+            test_data, test_lab_set = self.data_list(self.test_dir)
+            assert(train_lab_set == test_lab_set)
+            print "processing test set..."
+            file_lists = self.save_data(test_data,
+                                        "test",
+                                        self.batch_size,
+                                        False,
+                                        self.dict_with_test)
+            save_list(file_lists, self.test_list)
+
+        # save labels set.
+        save_dict(train_lab_set, self.label_list, False)
+        self.classes_num = len(train_lab_set.keys())
+
+        # save dictionary.
+        save_dict(self.word_count, self.dict_file, True)
+        self.dict_size = len(self.word_count)
+
+    def save_data(self, data, prefix="",
+                  batch_size=50000,
+                  is_shuffle=False,
+                  build_dict=False):
+        """
+        Create batches for a Dataset object.
+        data: the Dataset object to process.
+        prefix: the prefix of each batch.
+        batch_size: number of data in each batch.
+        build_dict: whether to build dictionary for data
+
+        return: list of batch names
+        """
+        if is_shuffle and self.multi_lines:
+            return self.save_data_multi_lines(data, prefix, batch_size, build_dict)
+
+        if is_shuffle:
+            random.shuffle(data)
+        num_batches = int(math.ceil(len(data) / float(batch_size)))
+        batch_names = []
+        for i in range(num_batches):
+            batch_name = join_path(self.output_path,
+                                   "%s_part_%03d" % (prefix, i))
+            begin = i * batch_size
+            end = min((i + 1) * batch_size, len(data))
+            # read a batch of data
+            label_list, data_list = self.get_data_list(begin, end, data)
+            if build_dict:
+                self.create_dict(data_list)
+            self.save_file(label_list, data_list, batch_name)
+            batch_names.append(batch_name)
+
+        return batch_names
+
+    def get_data_list(self, begin, end, data):
+        """
+        begin: int, begining index of data.
+        end: int, ending index of data.
+        data: a list of {"seq_path": sequence path, "label": label index}
+
+        return a list of label and a list of sequence.
+        """
+        label_list = []
+        data_list = []
+        for j in range(begin, end):
+            seqs = read_lines(data[j]["seq_path"])
+            lab = int(data[j]["label"])
+            # File may have multiple lines.
+            for seq in seqs:
+                data_list.append(seq)
+                label_list.append(lab)
+        if self.tokenizer:
+            data_list = tokenize(data_list)
+        return label_list, data_list
+
+    def save_data_multi_lines(self, data, prefix="",
+                              batch_size=50000,
+                              build_dict=False):
+        """
+        In order to shuffle fully, there is no need to load all data if
+        each file only contains one sample, it only needs to shuffle list
+        of file name. But one file contains multi lines, each line is one
+        sample. It needs to read all data into memory to shuffle fully.
+        This interface is mainly for data containing multi lines in each
+        file, which consumes more memory if there is a great mount of data.
+
+        data: the Dataset object to process.
+        prefix: the prefix of each batch.
+        batch_size: number of data in each batch.
+        build_dict: whether to build dictionary for data
+
+        return: list of batch names
+        """
+        assert self.multi_lines
+        label_list = []
+        data_list = []
+
+        # read all data
+        label_list, data_list = self.get_data_list(0, len(data), data)
+        if build_dict:
+            self.create_dict(data_list)
+
+        length = len(label_list)
+        perm_list = np.array([i for i in xrange(length)])
+        random.shuffle(perm_list)
+
+        num_batches = int(math.ceil(length / float(batch_size)))
+        batch_names = []
+        for i in range(num_batches):
+            batch_name = join_path(self.output_path,
+                                   "%s_part_%03d" % (prefix, i))
+            begin = i * batch_size
+            end = min((i + 1) * batch_size, length)
+            sub_label = [label_list[perm_list[j]] for j in range(begin, end)]
+            sub_data = [data_list[perm_list[j]] for j in range(begin, end)]
+            self.save_file(sub_label, sub_data, batch_name)
+            batch_names.append(batch_name)
+
+        return batch_names
+
+    def save_file(self, label_list, data_list, filename):
+        """
+        Save data into file.
+        label_list: a list of int value.
+        data_list: a list of sequence.
+        filename: output file name.
+        """
+        f = open(filename, 'w')
+        print "saving file: %s" % filename
+        for lab, seq in zip(label_list, data_list):
+            f.write('%s\t\t%s\n' % (lab, seq))
+        f.close()
+
+def option_parser():
+    parser = OptionParser(usage="usage: python preprocess.py "
+                                "-i data_dir [options]")
+    parser.add_option("-i", "--data", action="store",
+                      dest="input", help="Input data directory.")
+    parser.add_option("-o", "--output", action="store",
+                      dest="output", default=None,
+                      help="Output directory.")
+    parser.add_option("-t", "--tokenizer", action="store",
+                      dest="use_tokenizer", default=True,
+                      help="Whether to use tokenizer.")
+    parser.add_option("-m", "--multi_lines", action="store",
+                      dest="multi_lines", default=False,
+                      help="If input text files have multi lines and they "
+                           "need to be shuffled, you should set -m True,")
+    return parser.parse_args()
+
+def main():
+    options, args = option_parser()
+    data_dir = options.input
+    output_dir = options.output
+    use_tokenizer = options.use_tokenizer
+    multi_lines = options.multi_lines
+    if output_dir is None:
+        outname = os.path.basename(options.input)
+        output_dir = join_path(os.path.dirname(data_dir), 'pre-' + outname)
+    data_creator = SentimentDataSetCreate(data_dir, output_dir,
+                                          use_tokenizer, multi_lines)
+    data_creator.create_dataset()
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/sentiment/preprocess.sh b/demo/sentiment/preprocess.sh
new file mode 100755
index 00000000000000..5f5c78e222917d
--- /dev/null
+++ b/demo/sentiment/preprocess.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+echo "Start to preprocess..."
+
+data_dir="./data/imdb"
+python preprocess.py -i $data_dir
+
+echo "Done."
diff --git a/demo/sentiment/sentiment_net.py b/demo/sentiment/sentiment_net.py
new file mode 100644
index 00000000000000..f9f784c1f0b20e
--- /dev/null
+++ b/demo/sentiment/sentiment_net.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from os.path import join as join_path
+
+from paddle.trainer_config_helpers import *
+
+
+def sentiment_data(data_dir=None,
+                   is_test=False,
+                   is_predict=False,
+                   train_list="train.list",
+                   test_list="test.list",
+                   dict_file="dict.txt"):
+    """
+    Predefined data provider for sentiment analysis.
+    is_test: whether this config is used for test.
+    is_predict: whether this config is used for prediction.
+    train_list: text file name, containing a list of training set.
+    test_list: text file name, containing a list of testing set.
+    dict_file: text file name, containing dictionary.
+    """
+    if data_dir is not None:
+        train_list = join_path(data_dir, train_list)
+        test_list = join_path(data_dir, test_list)
+        dict_file = join_path(data_dir, dict_file)
+
+    dict_dim = len(open(dict_file).readlines())
+    class_dim = len(open(join_path(data_dir, 'labels.list')).readlines())
+    if is_predict:
+        return dict_dim, class_dim
+
+    train_list = train_list if not is_test else None
+    word_dict = dict()
+    with open(dict_file, 'r') as f:
+        for i, line in enumerate(f):
+            word_dict[line.split('\t')[0]] = i
+
+    define_py_data_sources2(train_list, test_list,
+                            module="dataprovider",
+                            obj="process",
+                            args={'dictionary': word_dict})
+
+    return dict_dim, class_dim
+
+
+def bidirectional_lstm_net(input_dim,
+                           class_dim=2,
+                           emb_dim=128,
+                           lstm_dim=128,
+                           is_predict=False):
+    data = data_layer("word", input_dim)
+    emb = embedding_layer(input=data, size=emb_dim)
+    bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim)
+    dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
+    output = fc_layer(input=dropout, size=class_dim,
+                      act=SoftmaxActivation())
+
+    if not is_predict:
+        lbl = data_layer("label", 1)
+        outputs(classification_cost(input=output, label=lbl))
+    else:
+        outputs(output)
+
+
+def stacked_lstm_net(input_dim,
+                     class_dim=2,
+                     emb_dim=128,
+                     hid_dim=512,
+                     stacked_num=3,
+                     is_predict=False):
+    """
+    A Wrapper for sentiment classification task.
+    This network uses bi-directional recurrent network,
+    consisting three LSTM layers. This configure is referred to
+    the paper as following url, but use fewer layers.
+    http://www.aclweb.org/anthology/P15-1109
+
+    input_dim: here is word dictionary dimension.
+    class_dim: number of categories.
+    emb_dim: dimension of word embedding.
+    hid_dim: dimension of hidden layer.
+    stacked_num: number of stacked lstm-hidden layer.
+    is_predict: is predicting or not.
+                Some layers are not needed in network when predicting.
+    """
+    hid_lr = 1e-3
+    assert stacked_num % 2 == 1
+
+    layer_attr = ExtraLayerAttribute(drop_rate=0.5)
+    fc_para_attr = ParameterAttribute(learning_rate=hid_lr)
+    lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=1.)
+    para_attr = [fc_para_attr, lstm_para_attr]
+    bias_attr = ParameterAttribute(initial_std=0., l2_rate=0.)
+    relu = ReluActivation()
+    linear = LinearActivation()
+
+    data = data_layer("word", input_dim)
+    emb = embedding_layer(input=data, size=emb_dim)
+
+    fc1 = fc_layer(input=emb, size=hid_dim, act=linear,
+                   bias_attr=bias_attr)
+    lstm1 = lstmemory(input=fc1, act=relu, bias_attr=bias_attr,
+                      layer_attr=layer_attr)
+
+    inputs = [fc1, lstm1]
+    for i in range(2, stacked_num + 1):
+        fc = fc_layer(input=inputs, size=hid_dim, act=linear,
+                      param_attr=para_attr, bias_attr=bias_attr)
+        lstm = lstmemory(input=fc, reverse=(i % 2) == 0, act=relu,
+                         bias_attr=bias_attr, layer_attr=layer_attr)
+        inputs = [fc, lstm]
+
+    fc_last = pooling_layer(input=inputs[0], pooling_type=MaxPooling())
+    lstm_last = pooling_layer(input=inputs[1], pooling_type=MaxPooling())
+    output = fc_layer(input=[fc_last, lstm_last], size=class_dim,
+                      act=SoftmaxActivation(),
+                      bias_attr=bias_attr, param_attr=para_attr)
+
+    if is_predict:
+        outputs(output)
+    else:
+        outputs(
+            classification_cost(input=output, label=data_layer('label', 1)))
diff --git a/demo/sentiment/test.sh b/demo/sentiment/test.sh
new file mode 100755
index 00000000000000..ffe404de6b5227
--- /dev/null
+++ b/demo/sentiment/test.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+function get_best_pass() {
+  cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
+  sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
+  sort -n | head -n 1
+}
+
+log=train.log
+LOG=`get_best_pass $log`
+LOG=(${LOG})
+evaluate_pass="model_output/pass-${LOG[1]}"
+
+echo 'evaluating from pass '$evaluate_pass
+
+model_list=./model.list
+touch $model_list; echo $evaluate_pass > $model_list
+net_conf=trainer_config.py
+paddle train --config=$net_conf \
+             --model_list=$model_list \
+             --job=test \
+             --use_gpu=false \
+             --trainer_count=4 \
+             --config_args=is_test=1 \
+             2>&1 | tee 'test.log'
diff --git a/demo/sentiment/train.sh b/demo/sentiment/train.sh
new file mode 100755
index 00000000000000..f44a9a53f2db9a
--- /dev/null
+++ b/demo/sentiment/train.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+config=trainer_config.py
+output=./model_output
+paddle train --config=$config \
+             --save_dir=$output \
+             --job=train \
+             --use_gpu=false \
+             --trainer_count=4 \
+             --num_passes=10 \
+             --log_period=10 \
+             --dot_period=20 \
+             --show_parameter_stats_period=100 \
+             --test_all_data_in_one_period=1 \
+             2>&1 | tee 'train.log'
diff --git a/demo/sentiment/trainer_config.py b/demo/sentiment/trainer_config.py
new file mode 100644
index 00000000000000..db24182a8d7359
--- /dev/null
+++ b/demo/sentiment/trainer_config.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from sentiment_net import *
+from paddle.trainer_config_helpers import *
+
+# whether this config is used for test
+is_test = get_config_arg('is_test', bool, False)
+# whether this config is used for prediction
+is_predict = get_config_arg('is_predict', bool, False)
+
+data_dir = "./data/pre-imdb"
+dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
+
+################## Algorithm Config #####################
+
+settings(
+    batch_size=128,
+    learning_rate=2e-3,
+    learning_method=AdamOptimizer(),
+    regularization=L2Regularization(8e-4),
+    gradient_clipping_threshold=25
+)
+
+#################### Network Config ######################
+stacked_lstm_net(dict_dim, class_dim=class_dim,
+                 stacked_num=3, is_predict=is_predict)
+# bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
diff --git a/demo/seqToseq/.gitignore b/demo/seqToseq/.gitignore
new file mode 100644
index 00000000000000..21cec2c2c1f342
--- /dev/null
+++ b/demo/seqToseq/.gitignore
@@ -0,0 +1,17 @@
+data/wmt14
+data/pre-wmt14
+data/wmt14_model
+data/paraphrase
+data/pre-paraphrase
+data/paraphrase_model
+translation/gen.log
+translation/gen_result
+translation/train.log
+paraphrase/train.log
+dataprovider_copy_1.py
+translation/thirdparty.tgz
+translation/thirdparty/train.conf
+translation/thirdparty/dataprovider.py
+translation/thirdparty/seqToseq_net.py
+translation/thirdparty/*.dict
+*.pyc
diff --git a/demo/seqToseq/data/paraphrase_data.sh b/demo/seqToseq/data/paraphrase_data.sh
new file mode 100755
index 00000000000000..ea1f8dbcfad356
--- /dev/null
+++ b/demo/seqToseq/data/paraphrase_data.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +set -x + +# download the in-house paraphrase dataset +# following is the google drive address +# you can also directly download from https://pan.baidu.com/s/1o8q577s +wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/paraphrase.tar.gz --no-check-certificate + +# untar the dataset +tar -zxvf paraphrase.tar.gz +rm paraphrase.tar.gz diff --git a/demo/seqToseq/data/paraphrase_model.sh b/demo/seqToseq/data/paraphrase_model.sh new file mode 100755 index 00000000000000..041f69cf467b13 --- /dev/null +++ b/demo/seqToseq/data/paraphrase_model.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -e +set -x + +dim=32 +pretrained_dir='../../model_zoo/embedding/' +preModel=$pretrained_dir'model_'$dim'.emb' +preDict=$pretrained_dir'baidu.dict' + +usrDict_dir='pre-paraphrase/' +srcDict=$usrDict_dir'src.dict' +trgDict=$usrDict_dir'trg.dict' + +usrModel_dir='paraphrase_model/' +mkdir $usrModel_dir +srcModel=$usrModel_dir'_source_language_embedding' +trgModel=$usrModel_dir'_target_language_embedding' + +echo 'extract desired parameters based on user dictionary' +script=$pretrained_dir'extract_para.py' +python $script --preModel $preModel --preDict $preDict \ + --usrModel $srcModel --usrDict $srcDict -d $dim +python $script --preModel $preModel --preDict $preDict \ + --usrModel $trgModel --usrDict $trgDict -d $dim diff --git a/demo/seqToseq/data/wmt14_data.sh b/demo/seqToseq/data/wmt14_data.sh new file mode 100755 index 00000000000000..6c360b206011a7 --- /dev/null +++ b/demo/seqToseq/data/wmt14_data.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -e +set -x +mkdir wmt14 +cd wmt14 + +# download the dataset +wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz +wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz + +# untar the dataset +tar -zxvf bitexts.tgz +tar -zxvf dev+test.tgz +gunzip bitexts.selected/* +mv bitexts.selected train +rm bitexts.tgz +rm dev+test.tgz + +# separate the dev and test dataset +mkdir test gen +mv dev/ntst1213.* test +mv dev/ntst14.* gen +rm -rf dev + +set +x +# rename the suffix, .fr->.src, .en->.trg +for dir in train test gen +do + filelist=`ls $dir` + cd $dir + for file in $filelist + do + if [ ${file##*.} = "fr" ]; then + mv $file ${file/%fr/src} + elif [ ${file##*.} = 'en' ]; then + mv $file ${file/%en/trg} + fi + done + cd .. +done diff --git a/demo/seqToseq/data/wmt14_model.sh b/demo/seqToseq/data/wmt14_model.sh new file mode 100755 index 00000000000000..2cec30688d27a5 --- /dev/null +++ b/demo/seqToseq/data/wmt14_model.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -e +set -x + +# download the pretrained model +# following is the google drive address +# you can also directly download from https://pan.baidu.com/s/1o8q577s +wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/wmt14_model.tar.gz --no-check-certificate + +# untar the model +tar -zxvf wmt14_model.tar.gz +rm wmt14_model.tar.gz diff --git a/demo/seqToseq/dataprovider.py b/demo/seqToseq/dataprovider.py new file mode 100755 index 00000000000000..a646667977d3eb --- /dev/null +++ b/demo/seqToseq/dataprovider.py @@ -0,0 +1,82 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer.PyDataProvider2 import * + +UNK_IDX = 2 +START = "" +END = "" + + +def hook(settings, src_dict, trg_dict, file_list, **kwargs): + # job_mode = 1: training mode + # job_mode = 0: generating mode + settings.job_mode = trg_dict is not None + settings.src_dict = src_dict + settings.logger.info("src dict len : %d" % (len(settings.src_dict))) + settings.sample_count = 0 + + if settings.job_mode: + settings.trg_dict = trg_dict + settings.slots = [ + integer_value( + len(settings.src_dict), + seq_type=SequenceType.SEQUENCE), integer_value( + len(settings.trg_dict), + seq_type=SequenceType.SEQUENCE), integer_value( + len(settings.trg_dict), + seq_type=SequenceType.SEQUENCE) + ] + settings.logger.info("trg dict len : %d" % (len(settings.trg_dict))) + else: + settings.slots = [ + integer_value( + len(settings.src_dict), + seq_type=SequenceType.SEQUENCE), integer_value( + len(open(file_list[0], "r").readlines()), + seq_type=SequenceType.SEQUENCE) + ] + + +def _get_ids(s, dictionary): + words = s.strip().split() + return [dictionary[START]] + \ + [dictionary.get(w, UNK_IDX) for w in words] + \ + [dictionary[END]] + + +@provider(init_hook=hook, pool_size=50000) +def process(settings, file_name): + with open(file_name, 'r') as f: + for line_count, line in enumerate(f): + line_split = line.strip().split('\t') + if settings.job_mode and len(line_split) != 2: + continue + src_seq = line_split[0] # one source sequence + src_ids = _get_ids(src_seq, settings.src_dict) + + if settings.job_mode: + trg_seq = line_split[1] # one target sequence + trg_words = trg_seq.split() + trg_ids = [settings.trg_dict.get(w, UNK_IDX) + for w in trg_words] + + # remove sequence whose length > 80 in training mode + if len(src_ids) > 80 or len(trg_ids) > 80: + continue + trg_ids_next = trg_ids + [settings.trg_dict[END]] + trg_ids = [settings.trg_dict[START]] + trg_ids + yield src_ids, trg_ids, trg_ids_next + else: + yield src_ids, [line_count] diff --git 
a/demo/seqToseq/paraphrase/train.conf b/demo/seqToseq/paraphrase/train.conf new file mode 100644 index 00000000000000..748920e2c72537 --- /dev/null +++ b/demo/seqToseq/paraphrase/train.conf @@ -0,0 +1,33 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +sys.path.append("..") + +from seqToseq_net import * + +is_generating = False +### Data Definiation +train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase", + is_generating = is_generating) + +### Algorithm Configuration +settings( + learning_method = AdamOptimizer(), + batch_size = 50, + learning_rate = 5e-4) + +### Network Architecture +gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32) diff --git a/demo/seqToseq/paraphrase/train.sh b/demo/seqToseq/paraphrase/train.sh new file mode 100755 index 00000000000000..2aa7b84060b198 --- /dev/null +++ b/demo/seqToseq/paraphrase/train.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +cd .. + +paddle train \ + --config='paraphrase/train.conf' \ + --save_dir='paraphrase/model' \ + --init_model_path='data/paraphrase_model' \ + --load_missing_parameter_strategy=rand \ + --use_gpu=false \ + --num_passes=16 \ + --show_parameter_stats_period=100 \ + --trainer_count=4 \ + --log_period=10 \ + --dot_period=5 \ + 2>&1 | tee 'paraphrase/train.log' diff --git a/demo/seqToseq/preprocess.py b/demo/seqToseq/preprocess.py new file mode 100755 index 00000000000000..5efb17a664b9a2 --- /dev/null +++ b/demo/seqToseq/preprocess.py @@ -0,0 +1,204 @@ +#!/bin/env python +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Example: + python preprocess.py -i INPUT [-d DICTSIZE] [-m] + +Options: + -h, --help show this help message and exit + -i INPUT input original dataset path + -d DICTSIZE specified word count of dictionary + -m --mergeDict merge source and target dictionary +""" +import os +import sys + +import string +from optparse import OptionParser +from paddle.utils.preprocess_util import save_list, DatasetCreater + +class SeqToSeqDatasetCreater(DatasetCreater): + """ + A class to process data for sequence to sequence application. + """ + + def __init__(self, data_path, output_path): + """ + data_path: the path to store the train data, test data and gen data + output_path: the path to store the processed dataset + """ + DatasetCreater.__init__(self, data_path) + self.gen_dir_name = 'gen' + self.gen_list_name = 'gen.list' + self.output_path = output_path + + def concat_file(self, file_path, file1, file2, output_path, output): + """ + Concat file1 and file2 to be one output file + The i-th line of output = i-th line of file1 + '\t' + i-th line of file2 + file_path: the path to store file1 and file2 + output_path: the path to store output file + """ + file1 = os.path.join(file_path, file1) + file2 = os.path.join(file_path, file2) + output = os.path.join(output_path, output) + if not os.path.exists(output): + os.system('paste ' + file1 + ' ' + file2 + ' > ' + output) + + def cat_file(self, dir_path, suffix, output_path, output): + """ + Cat all the files in dir_path with suffix to be one output file + dir_path: the base directory to store input file + suffix: suffix of file name + output_path: the path to store output file + """ + cmd = 'cat ' + file_list = os.listdir(dir_path) + file_list.sort() + for file in file_list: + if file.endswith(suffix): + cmd += os.path.join(dir_path, file) + ' ' + output = os.path.join(output_path, output) + if not os.path.exists(output): + os.system(cmd + '> ' + output) + + def build_dict(self, file_path, dict_path, dict_size = -1): + """ + 
Create the dictionary for the file, Note that + 1. Valid characters include all printable characters + 2. There is distinction between uppercase and lowercase letters + 3. There is 3 special token: + : the start of a sequence + : the end of a sequence + : a word not included in dictionary + file_path: the path to store file + dict_path: the path to store dictionary + dict_size: word count of dictionary + if is -1, dictionary will contains all the words in file + """ + if not os.path.exists(dict_path): + dictory = dict() + with open(file_path, "r") as fdata: + for line in fdata: + line = line.split('\t') + for line_split in line: + words = line_split.strip().split() + for word in words: + if word not in dictory: + dictory[word] = 1 + else: + dictory[word] += 1 + output = open(dict_path, "w+") + output.write('\n\n\n') + count = 3 + for key, value in sorted(dictory.items(), key = lambda d:d[1], reverse = True): + output.write(key + "\n") + count += 1 + if count == dict_size: + break + self.dict_size = count + + def create_dataset(self, dict_size = -1, mergeDict = False, + suffixes = ['.src', '.trg']): + """ + Create seqToseq dataset + """ + # dataset_list and dir_list has one-to-one relationship + train_dataset = os.path.join(self.data_path, self.train_dir_name) + test_dataset = os.path.join(self.data_path, self.test_dir_name) + gen_dataset = os.path.join(self.data_path, self.gen_dir_name) + dataset_list = [train_dataset, test_dataset, gen_dataset] + + train_dir = os.path.join(self.output_path, self.train_dir_name) + test_dir = os.path.join(self.output_path, self.test_dir_name) + gen_dir = os.path.join(self.output_path, self.gen_dir_name) + dir_list = [train_dir, test_dir, gen_dir] + + # create directory + for dir in dir_list: + if not os.path.exists(dir): + os.mkdir(dir) + + # checkout dataset should be parallel corpora + suffix_len = len(suffixes[0]) + for dataset in dataset_list: + file_list = os.listdir(dataset) + if len(file_list) % 2 == 1: + raise 
RuntimeError("dataset should be parallel corpora") + file_list.sort() + for i in range(0, len(file_list), 2): + if file_list[i][:-suffix_len] != file_list[i + 1][:-suffix_len]: + raise RuntimeError("source and target file name should be equal") + + # cat all the files with the same suffix in dataset + for suffix in suffixes: + for dataset in dataset_list: + outname = os.path.basename(dataset) + suffix + self.cat_file(dataset, suffix, dataset, outname) + + # concat parallel corpora and create file.list + print 'concat parallel corpora for dataset' + id = 0 + list = ['train.list', 'test.list', 'gen.list'] + for dataset in dataset_list: + outname = os.path.basename(dataset) + self.concat_file(dataset, outname + suffixes[0], + outname + suffixes[1], dir_list[id], outname) + save_list([os.path.join(dir_list[id], outname)], + os.path.join(self.output_path, list[id])) + id += 1 + + # build dictionary for train data + dict = ['src.dict', 'trg.dict'] + dict_path = [os.path.join(self.output_path, dict[0]), + os.path.join(self.output_path, dict[1])] + if mergeDict: + outname = os.path.join(train_dir, train_dataset.split('/')[-1]) + print 'build src dictionary for train data' + self.build_dict(outname, dict_path[0], dict_size) + print 'build trg dictionary for train data' + os.system('cp ' + dict_path[0] + ' ' + dict_path[1]) + else: + outname = os.path.join(train_dataset, self.train_dir_name) + for id in range(0,2): + suffix = suffixes[id] + print 'build ' + suffix[1:] + ' dictionary for train data' + self.build_dict(outname + suffix, dict_path[id], dict_size) + print 'dictionary size is', self.dict_size + +def main(): + usage = "usage: \n" \ + "python %prog -i INPUT [-d DICTSIZE] [-m]" + parser = OptionParser(usage) + parser.add_option("-i", action="store", dest="input", + help="input original dataset path") + parser.add_option("-d", action="store", dest="dictsize", + help="specified word count of dictionary") + parser.add_option("-m", "--mergeDict", action="store_true", 
dest="mergeDict", + help="merge source and target dictionary") + (options, args) = parser.parse_args() + if options.input[-1] == os.path.sep: + options.input = options.input[:-1] + outname = os.path.basename(options.input) + output_path = os.path.join(os.path.dirname(options.input), 'pre-' + outname) + dictsize = int(options.dictsize) if options.dictsize else -1 + if not os.path.exists(output_path): + os.mkdir(output_path) + data_creator = SeqToSeqDatasetCreater(options.input, output_path) + data_creator.create_dataset(dictsize, options.mergeDict) + +if __name__ == "__main__": + main(); diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py new file mode 100644 index 00000000000000..8b613de71ade4d --- /dev/null +++ b/demo/seqToseq/seqToseq_net.py @@ -0,0 +1,183 @@ +# edit-mode: -*- python -*- + +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import os +from paddle.trainer_config_helpers import * + + +def seq_to_seq_data(data_dir, + is_generating, + dict_size=30000, + train_list='train.list', + test_list='test.list', + gen_list='gen.list', + gen_result='gen_result'): + """ + Predefined seqToseq train data provider for application + is_generating: whether this config is used for generating + dict_size: word count of dictionary + train_list: a text file containing a list of training data + test_list: a text file containing a list of testing data + gen_list: a text file containing a list of generating data + gen_result: a text file containing generating result + """ + src_lang_dict = os.path.join(data_dir, 'src.dict') + trg_lang_dict = os.path.join(data_dir, 'trg.dict') + src_dict = dict() + for line_count, line in enumerate(open(src_lang_dict, "r")): + src_dict[line.strip()] = line_count + trg_dict = dict() + for line_count, line in enumerate(open(trg_lang_dict, "r")): + trg_dict[line.strip()] = line_count + + if is_generating: + train_list = None + test_list = os.path.join(data_dir, gen_list) + trg_dict = None + else: + train_list = os.path.join(data_dir, train_list) + test_list = os.path.join(data_dir,test_list) + + define_py_data_sources2(train_list, test_list, + module = "dataprovider", + obj = "process", + args = {"src_dict": src_dict, + "trg_dict": trg_dict}) + + return {"src_dict_path": src_lang_dict, "trg_dict_path": trg_lang_dict, + "gen_result": gen_result} + + +def gru_encoder_decoder(data_conf, + is_generating, + word_vector_dim=512, + encoder_size=512, + decoder_size=512, + beam_size=3, + max_length=250): + """ + A wrapper for an attention version of GRU Encoder-Decoder network + is_generating: whether this config is used for generating + encoder_size: dimension of hidden unit in GRU Encoder network + decoder_size: dimension of hidden unit in GRU Decoder network + word_vector_dim: dimension of word vector + beam_size: expand width in beam search + max_length: a stop condition 
of sequence generation + """ + for k, v in data_conf.iteritems(): + globals()[k] = v + source_dict_dim = len(open(src_dict_path, "r").readlines()) + target_dict_dim = len(open(trg_dict_path, "r").readlines()) + gen_trans_file = gen_result + + src_word_id = data_layer(name='source_language_word', size=source_dict_dim) + src_embedding = embedding_layer( + input=src_word_id, + size=word_vector_dim, + param_attr=ParamAttr(name='_source_language_embedding'), ) + src_forward = simple_gru(input=src_embedding, size=encoder_size, ) + src_backward = simple_gru(input=src_embedding, + size=encoder_size, + reverse=True, ) + encoded_vector = concat_layer(input=[src_forward, src_backward]) + + with mixed_layer(size=decoder_size) as encoded_proj: + encoded_proj += full_matrix_projection(encoded_vector) + + backward_first = first_seq(input=src_backward) + with mixed_layer(size=decoder_size, + act=TanhActivation(), ) as decoder_boot: + decoder_boot += full_matrix_projection(backward_first) + + def gru_decoder_with_attention(enc_vec, enc_proj, current_word): + decoder_mem = memory(name='gru_decoder', + size=decoder_size, + boot_layer=decoder_boot) + + context = simple_attention(encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem, ) + + with mixed_layer(size=decoder_size * 3) as decoder_inputs: + decoder_inputs += full_matrix_projection(context) + decoder_inputs += full_matrix_projection(current_word) + + gru_step = gru_step_layer(name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) + + with mixed_layer(size=target_dict_dim, + bias_attr=True, + act=SoftmaxActivation()) as out: + out += full_matrix_projection(input=gru_step) + return out + + decoder_group_name = "decoder_group" + if not is_generating: + trg_embedding = embedding_layer( + input=data_layer(name='target_language_word', + size=target_dict_dim), + size=word_vector_dim, + param_attr=ParamAttr(name='_target_language_embedding')) + + # For decoder equipped with 
attention mechanism, in training, + # target embeding (the groudtruth) is the data input, + # while encoded source sequence is accessed to as an unbounded memory. + # Here, the StaticInput defines a read-only memory + # for the recurrent_group. + decoder = recurrent_group(name=decoder_group_name, + step=gru_decoder_with_attention, + input=[ + StaticInput(input=encoded_vector, + is_seq=True), + StaticInput(input=encoded_proj, + is_seq=True), trg_embedding + ], ) + + lbl = data_layer(name='target_language_next_word', + size=target_dict_dim) + cost = classification_cost(input=decoder, label=lbl, ) + outputs(cost) + else: + gen_inputs = [StaticInput(input=encoded_vector, + is_seq=True), + StaticInput(input=encoded_proj, + is_seq=True), ] + # In generation, decoder predicts a next target word based on + # the encoded source sequence and the last generated target word. + # The encoded source sequence (encoder's output) must be specified by + # StaticInput which is a read-only memory. + # Here, GeneratedInputs automatically fetchs the last generated word, + # which is initialized by a start mark, such as . + trg_embedding = GeneratedInput( + size=target_dict_dim, + embedding_name='_target_language_embedding', + embedding_size=word_vector_dim) + gen_inputs.append(trg_embedding) + beam_gen = beam_search(name=decoder_group_name, + step=gru_decoder_with_attention, + input=gen_inputs, + id_input=data_layer(name="sent_id", + size=1), + dict_file=trg_dict_path, + bos_id=0, + eos_id=1, + beam_size=beam_size, + max_length=max_length, + result_file=gen_trans_file) + outputs(beam_gen) diff --git a/demo/seqToseq/translation/eval_bleu.sh b/demo/seqToseq/translation/eval_bleu.sh new file mode 100755 index 00000000000000..ef0ede717a740f --- /dev/null +++ b/demo/seqToseq/translation/eval_bleu.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. 
All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +gen_file=$1 +beam_size=$2 + +# find top1 generating result +top1=$(printf '%s_top1.txt' `basename $gen_file .txt`) +if [ $beam_size -eq 1 ]; then + awk -F "\t" '{sub(" ","",$2);sub(" ","",$2);print $2}' $gen_file >$top1 +else + awk 'BEGIN{ + FS="\t"; + OFS="\t"; + read_pos = 2} { + if (NR == read_pos){ + sub(" ","",$3); + sub(" ","",$3); + print $3; + read_pos += (2 + res_num); + }}' res_num=$beam_size $gen_file >$top1 +fi + +# evalute bleu value +bleu_script=multi-bleu.perl +standard_res=../data/wmt14/gen/ntst14.trg +bleu_res=`perl $bleu_script $standard_res <$top1` + +echo $bleu_res +rm $top1 diff --git a/demo/seqToseq/translation/gen.conf b/demo/seqToseq/translation/gen.conf new file mode 100644 index 00000000000000..63c5c2f9a6052c --- /dev/null +++ b/demo/seqToseq/translation/gen.conf @@ -0,0 +1,36 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +sys.path.append("..") + +from seqToseq_net import * + +# whether this config is used for generating +is_generating = True + +### Data Definiation +gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14", + is_generating = is_generating, + gen_result = "./translation/gen_result") + +### Algorithm Configuration +settings( + learning_method = AdamOptimizer(), + batch_size = 1, + learning_rate = 0) + +### Network Architecture +gru_encoder_decoder(gen_conf, is_generating) diff --git a/demo/seqToseq/translation/gen.sh b/demo/seqToseq/translation/gen.sh new file mode 100755 index 00000000000000..ad977c05ff9897 --- /dev/null +++ b/demo/seqToseq/translation/gen.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +cd .. + +paddle train \ + --job=test \ + --config='translation/gen.conf' \ + --save_dir='data/wmt14_model' \ + --use_gpu=false \ + --num_passes=13 \ + --test_pass=12 \ + --trainer_count=1 \ + 2>&1 | tee 'translation/gen.log' diff --git a/demo/seqToseq/translation/moses_bleu.sh b/demo/seqToseq/translation/moses_bleu.sh new file mode 100755 index 00000000000000..bfaba40b26905c --- /dev/null +++ b/demo/seqToseq/translation/moses_bleu.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. 
All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +set -x +echo "Downloading multi-bleu.perl" +wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl --no-check-certificate diff --git a/demo/seqToseq/translation/train.conf b/demo/seqToseq/translation/train.conf new file mode 100644 index 00000000000000..cf1bde15c4a8aa --- /dev/null +++ b/demo/seqToseq/translation/train.conf @@ -0,0 +1,36 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +sys.path.append("..") + +from seqToseq_net import * + +# whether this config is used for generating +is_generating = False + +### Data Definiation +data_dir = "./data/pre-wmt14" +train_conf = seq_to_seq_data(data_dir = data_dir, + is_generating = is_generating) + +### Algorithm Configuration +settings( + learning_method = AdamOptimizer(), + batch_size = 50, + learning_rate = 5e-4) + +### Network Architecture +gru_encoder_decoder(train_conf, is_generating) diff --git a/demo/seqToseq/translation/train.sh b/demo/seqToseq/translation/train.sh new file mode 100755 index 00000000000000..976b5ba3b054c4 --- /dev/null +++ b/demo/seqToseq/translation/train.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +cd .. 
+ +paddle train \ +--config='translation/train.conf' \ +--save_dir='translation/model' \ +--use_gpu=false \ +--num_passes=16 \ +--show_parameter_stats_period=100 \ +--trainer_count=4 \ +--log_period=10 \ +--dot_period=5 \ +2>&1 | tee 'translation/train.log' diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt new file mode 100644 index 00000000000000..b8ccfc6be5d34c --- /dev/null +++ b/doc/CMakeLists.txt @@ -0,0 +1,49 @@ + + + +if(NOT DEFINED SPHINX_THEME) + set(SPHINX_THEME default) +endif() + +if(NOT DEFINED SPHINX_THEME_DIR) + set(SPHINX_THEME_DIR) +endif() + +# configured documentation tools and intermediate build results +set(BINARY_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build") + +# Sphinx cache with pickled ReST documents +set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees") + +# HTML output directory +set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html") + + +set(PADDLE_DOXYGEN_OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/doxygen_xml") + +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in" + "${BINARY_BUILD_DIR}/conf.py" + @ONLY) + +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in" + "${CMAKE_CURRENT_BINARY_DIR}/Doxyfile" + @ONLY + ) + +add_custom_target(paddle_doxygen_docs ALL + ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} +) + +sphinx_add_target(paddle_docs + html + ${BINARY_BUILD_DIR} + ${SPHINX_CACHE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR} + ${SPHINX_HTML_DIR}) + +add_dependencies(paddle_docs + gen_proto_py + paddle_doxygen_docs) \ No newline at end of file diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in new file mode 100644 index 00000000000000..a1fc3801925dd3 --- /dev/null +++ b/doc/Doxyfile.in @@ -0,0 +1,2384 @@ +# Doxyfile 1.8.10 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. 
+# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "paddle" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = 1.0.0 + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. 
Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = @PADDLE_DOXYGEN_OUTPUT@ + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. 
+ +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. 
+ +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. 
+# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 2 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. 
Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. 
+# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = YES + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. 
+ +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. 
And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. 
+ +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. 
This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. 
+ +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = NO + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. 
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. 
+# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. 
+ +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. 
+ +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = NO + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. 
Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = @PROJ_ROOT@/paddle + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. 
+# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, +# *.vhdl, *.ucf, *.qsf, *.as and *.js. + +FILE_PATTERNS = *.c *.cc *.cpp *.cu *.h *.hpp *.cuh *.ph + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = */x86_64-scm-linux-gnu/* */internals/* */mkl/* */test/* */tests/* */platform/* + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. 
Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# +# +# where is the value of the INPUT_FILTER tag, and is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. 
Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. 
+ +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. 
The htags tool is part of GNU's global source tagging system
+# (see http://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX = YES
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index.
The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = NO + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. 
For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. 
+# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. 
Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. 
The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
+ +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want your formulas to look prettier in the HTML output.
+# When enabled you may also need to install MathJax separately and configure
+# the path to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering.
For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>